目录
一.豆瓣250
# --- Douban Top 250 scraper ---
# Fetches the first page of Douban's Top 250 movies and stores
# title / poster URL / score / reviewer count into MySQL (pydb.pytable).
import requests
from bs4 import BeautifulSoup
import pymysql

# Connect to MySQL.  charset="utf8mb4" is required so Chinese titles
# survive the round-trip (the driver's default charset can mangle them).
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="root", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("create table if not exists pytable("
               "id int primary key auto_increment comment '编号',"
               "title varchar(255) not null comment '标题',"
               "image varchar(255) not null comment '图片',"
               "score varchar(255) not null comment '评分',"
               "reviewerNum varchar(255) not null comment '评价人数')"
               )

url = 'https://movie.douban.com/top250'
# A browser User-Agent is needed: Douban rejects the default python-requests UA.
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
res = requests.get(url, headers=head)
print(res.status_code)

if res.ok:
    rows = []  # renamed from `list` to avoid shadowing the builtin
    soup = BeautifulSoup(res.text, 'html.parser')
    for ol in soup.find_all('ol', attrs={'class': 'grid_view'}):
        for li in ol.find_all('li'):
            title = li.find('span', attrs={'class': 'title'}).text
            image_url = li.find('div', attrs={'class': 'pic'}).find('img')['src']
            star = li.find('div', attrs={'class': 'star'})
            spans = star.find_all('span')
            score = spans[1].text          # rating number, e.g. "9.7"
            reviewers_num = spans[3].text  # e.g. "123456人评价"
            row = (title, image_url, score, reviewers_num)
            rows.append(row)
            print(row)
    # Bulk insert with parameterized SQL (safe against injection).
    cursor.executemany(
        "insert into pytable(title,image,score,reviewerNum) values(%s,%s,%s,%s)",
        rows)
else:
    print("请求失败")
db.commit()
db.close()
二.哔哩哔哩
# --- Bilibili home-page feed scraper ---
# Scrapes the "feed2" recommendation cards on bilibili.com and stores
# title / play count / danmaku count / duration / publish date / author /
# cover image URL into MySQL (pydb.bilibiliTable).
import requests
from bs4 import BeautifulSoup
import pymysql

# charset="utf8mb4" so Chinese titles/authors are stored correctly.
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="root", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
# BUG FIX: the original DDL had the COMMENTs for `date` and `author`
# swapped ('作者' on date, '发布时间' on author); corrected here —
# `date` is the publish time, `author` is the uploader.
cursor.execute("create table if not exists bilibiliTable("
               "id int primary key auto_increment comment '编号',"
               "title varchar(255) not null comment '标题',"
               "plays varchar(255) not null comment '播放量',"
               "barrageNum varchar(255) not null comment '弹幕数',"
               "duration varchar(255) not null comment '播放时长',"
               "date varchar(255) not null comment '发布时间',"
               "author varchar(255) not null comment '作者',"
               "image varchar(255) not null comment '图片')"
               )

url = 'https://www.bilibili.com/'
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
res = requests.get(url, headers=head)
print(res.status_code)

rows = []  # renamed from `list` to avoid shadowing the builtin
if res.ok:
    soup = BeautifulSoup(res.text, 'html.parser')
    feed = soup.find('div', class_='feed2')
    for card in feed.find_all('div', class_='feed-card'):
        image = card.find('img')['src']  # cover image URL
        print(card.find('img'))
        stats = card.find('div', class_='bili-video-card__stats')
        stat_texts = stats.find_all('span', class_='bili-video-card__stats--text')
        # First stats span is the play count; the last one is the danmaku
        # count (replaces the original boolean-flag loop, same result).
        plays = stat_texts[0].text if stat_texts else ''
        barrage_num = stat_texts[-1].text if len(stat_texts) > 1 else ''
        duration = stats.find('span', class_='bili-video-card__stats__duration').text
        info = card.find('div', class_='bili-video-card__info __scale-disable')
        title = info.find('h3', class_='bili-video-card__info--tit').text
        author = info.find('span', class_='bili-video-card__info--author').text
        date = info.find('span', class_='bili-video-card__info--date').text
        rows.append((title, plays, barrage_num, duration, date, author, image))
    cursor.executemany(
        "insert into bilibiliTable (title, plays, barrageNum, duration, date, author, image) VALUES (%s,%s,%s,%s,%s,%s,%s)",
        rows)
db.commit()
db.close()
三.CSDN
# --- CSDN home-page scraper ---
# Scrapes headline cards from csdn.net and stores title / like count /
# author into MySQL (pydb.pyscdn).
import requests
from bs4 import BeautifulSoup
import pymysql

# charset="utf8mb4" so Chinese titles/authors are stored correctly.
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="root", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("CREATE TABLE IF NOT EXISTS pyscdn (id INT PRIMARY KEY auto_increment COMMENT '编号',title VARCHAR (255) NOT NULL COMMENT '标题',supportNum VARCHAR (255) NOT NULL COMMENT '赞',author VARCHAR(255) NOT NULL COMMENT '作者')")

url = 'https://www.csdn.net/'
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"}
res = requests.get(url, headers=head)
print(res.status_code)

if res.ok:
    rows = []  # renamed from `list` to avoid shadowing the builtin
    soup = BeautifulSoup(res.text, 'html.parser')
    www_content = soup.find("div", "www-content")
    for active in www_content.find_all("div", class_="active"):
        content = active.find("div", class_="content")
        title = content.find("span", class_="blog-text").text
        operation = content.find("div", class_="operation")
        like_block = operation.find("p", class_="operation-b-img operation-b-img-active")
        support_num = like_block.find("span", class_="num").text
        author = operation.find("div", class_="operation-c").find("span").text
        # BUG FIX: tuple order now matches the INSERT column list
        # (title, supportNum, author).  The original appended
        # (title, author, supportNum), so author and like count were
        # stored in each other's columns.
        row = (title, support_num, author)
        rows.append(row)
        print(row)
    cursor.executemany(
        "insert into pyscdn(title,supportNum,author) values(%s,%s,%s)",
        rows)
else:
    print("请求失败")
db.commit()
db.close()
四.腾讯课堂(注:下方代码为第三节 CSDN 爬虫的重复副本,腾讯课堂爬虫尚未实现)
# --- Section 四 "腾讯课堂" ---
# NOTE(review): despite the section header, this is a verbatim copy of the
# Section 三 CSDN scraper (same URL, same table) — the Tencent Classroom
# scraper was never written.  TODO: replace with the intended scraper;
# running it as-is inserts duplicate CSDN rows into pyscdn.
import requests
from bs4 import BeautifulSoup
import pymysql

# charset="utf8mb4" so Chinese titles/authors are stored correctly.
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="root", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("CREATE TABLE IF NOT EXISTS pyscdn (id INT PRIMARY KEY auto_increment COMMENT '编号',title VARCHAR (255) NOT NULL COMMENT '标题',supportNum VARCHAR (255) NOT NULL COMMENT '赞',author VARCHAR(255) NOT NULL COMMENT '作者')")

url = 'https://www.csdn.net/'
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"}
res = requests.get(url, headers=head)
print(res.status_code)

if res.ok:
    rows = []  # renamed from `list` to avoid shadowing the builtin
    soup = BeautifulSoup(res.text, 'html.parser')
    www_content = soup.find("div", "www-content")
    for active in www_content.find_all("div", class_="active"):
        content = active.find("div", class_="content")
        title = content.find("span", class_="blog-text").text
        operation = content.find("div", class_="operation")
        like_block = operation.find("p", class_="operation-b-img operation-b-img-active")
        support_num = like_block.find("span", class_="num").text
        author = operation.find("div", class_="operation-c").find("span").text
        # BUG FIX: tuple order now matches the INSERT column list
        # (title, supportNum, author); the original swapped author and
        # like count.
        row = (title, support_num, author)
        rows.append(row)
        print(row)
    cursor.executemany(
        "insert into pyscdn(title,supportNum,author) values(%s,%s,%s)",
        rows)
else:
    print("请求失败")
db.commit()
db.close()
五.美食图片(注:下方代码为第三节 CSDN 爬虫的重复副本,美食图片爬虫尚未实现)
# --- Section 五 "美食图片" ---
# NOTE(review): despite the section header, this is a verbatim copy of the
# Section 三 CSDN scraper (same URL, same table) — no food-image scraping
# happens here.  TODO: replace with the intended scraper; running it
# as-is inserts duplicate CSDN rows into pyscdn.
import requests
from bs4 import BeautifulSoup
import pymysql

# charset="utf8mb4" so Chinese titles/authors are stored correctly.
db = pymysql.connect(host="localhost", port=3306, user="root",
                     passwd="root", charset="utf8mb4")
cursor = db.cursor()
cursor.execute("create database if not exists pydb")
cursor.execute("use pydb")
cursor.execute("CREATE TABLE IF NOT EXISTS pyscdn (id INT PRIMARY KEY auto_increment COMMENT '编号',title VARCHAR (255) NOT NULL COMMENT '标题',supportNum VARCHAR (255) NOT NULL COMMENT '赞',author VARCHAR(255) NOT NULL COMMENT '作者')")

url = 'https://www.csdn.net/'
head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"}
res = requests.get(url, headers=head)
print(res.status_code)

if res.ok:
    rows = []  # renamed from `list` to avoid shadowing the builtin
    soup = BeautifulSoup(res.text, 'html.parser')
    www_content = soup.find("div", "www-content")
    for active in www_content.find_all("div", class_="active"):
        content = active.find("div", class_="content")
        title = content.find("span", class_="blog-text").text
        operation = content.find("div", class_="operation")
        like_block = operation.find("p", class_="operation-b-img operation-b-img-active")
        support_num = like_block.find("span", class_="num").text
        author = operation.find("div", class_="operation-c").find("span").text
        # BUG FIX: tuple order now matches the INSERT column list
        # (title, supportNum, author); the original swapped author and
        # like count.
        row = (title, support_num, author)
        rows.append(row)
        print(row)
    cursor.executemany(
        "insert into pyscdn(title,supportNum,author) values(%s,%s,%s)",
        rows)
else:
    print("请求失败")
db.commit()
db.close()