"""
example11 - 用数据库持久化爬虫数据
400 - Bad request.
401 - Unauthorized.
403 - Forbidden.
404 - Not Found.
405 - Method not allowed.
418 - I am a teapot.
create table `tb_top_movie`
(
`mov_id` bigint unsigned auto_increment comment '编号',
`mov_title` varchar(200) not null comment '标题',
`mov_rating_num` decimal(3,1) not null comment '评分',
`mov_comments_count` bigint not null comment '评论数',
primary key (`mov_id`)
) engine=innodb auto_increment=1001 comment '电影数据表';
Author: Hao
Date: 2022/5/28
"""
import bs4
import pymysql
import requests
from pymysql.cursors import Cursor
def fetch_page(session, url):
    """Fetch one page over HTTP.

    :param session: requests Session used to issue the GET request
    :param url: the URL to fetch
    :return: the page's HTML text, or an empty string on any non-200 status
    """
    response = session.get(url=url)
    if response.status_code == 200:
        return response.text
    return ''
def parse_page(html_code):
    """Extract movie records from a Douban Top250 list page.

    :param html_code: HTML source of the page
    :return: list of (title, rating, comment-count) string tuples
    """
    soup = bs4.BeautifulSoup(html_code, 'html.parser')
    records = []
    for item in soup.select('#content > div > div.article > ol > li'):
        title_tag = item.select_one('div > div.info > div.hd > a > span.title')
        rating_tag = item.select_one('div > div.info > div.bd > div > span.rating_num')
        count_tag = item.select_one('div > div.info > div.bd > div > span:nth-child(4)')
        # The count span's text ends with three trailing characters ("人评价");
        # slice them off so only the number remains.
        records.append((title_tag.text, rating_tag.text, count_tag.text[:-3]))
    return records
def save_to_db(conn, data):
    """Persist scraped movie records into the database.

    :param conn: an open DB-API connection (committed on success)
    :param data: iterable of (title, rating, comment-count) tuples
    """
    sql = ('insert into tb_top_movie (mov_title, mov_rating_num, mov_comments_count) '
           'values (%s, %s, %s)')
    with conn.cursor() as cursor:
        cursor.executemany(sql, data)
    conn.commit()
def main():
    """Crawl all ten Douban Top250 pages and store the results in MySQL."""
    session = requests.Session()
    # Present a crawler User-Agent for every request made through this session.
    session.headers = {'User-Agent': 'Baiduspider'}
    conn = pymysql.connect(host='localhost', port=3306,
                           user='guest', password='Guest.618',
                           database='hrs', charset='utf8mb4')
    try:
        # Pages start at offsets 0, 25, ..., 225 (25 movies per page).
        for start in range(0, 250, 25):
            page_url = f'https://movie.douban.com/top250?start={start}'
            html = fetch_page(session, page_url)
            save_to_db(conn, parse_page(html))
    finally:
        # Always release the connection, even if a fetch or insert fails.
        conn.close()
# Run the crawler only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
# 【笔记】2022.5.28 从网页获取数据并写入数据库
# 于 2022-05-30 08:52:26 首次发布