项目源码
import requests # 发起请求的库
from fake_useragent import UserAgent # 构造user-Agent的库
import mysql.connector # Python和MySQL连接的库
from typing import NoReturn, Tuple # 类型标注的库
from lxml import etree # 使用lxml提取html的库
import re # Python中的正则表达式库
import csv # 写入csv文件的库
import time # 程序休眠库
class bouban_Top250:
    """Scraper for the Douban Top 250 movie list.

    Downloads list pages over HTTP, extracts one record per movie with lxml
    XPath queries, and can persist the records to MySQL, a CSV file, or a
    plain-text file.
    """

    # Matches any run of whitespace (newlines included). Compiled once here
    # instead of once per movie inside the parsing loop.
    _WHITESPACE = re.compile(r'\s+')

    # Column titles shared by the CSV and TXT writers.
    _HEADERS = ('编号', '电影名称', '电影链接', '演员信息', '豆瓣评分', '引述')

    # Seconds to wait for the server before a request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self, host: str = 'localhost', user: str = 'root',
                 passwd: str = '123456', port: int = 3307,
                 charset: str = 'utf8', database: str = 'reptile'):
        """Open the MySQL connection and initialise the scraper state.

        All parameters are forwarded to ``mysql.connector.connect``; the
        defaults reproduce the previously hard-coded settings, so calling
        ``bouban_Top250()`` with no arguments behaves as before.
        """
        self.conn = mysql.connector.connect(
            host=host,
            user=user,
            passwd=passwd,
            port=port,
            charset=charset,
            database=database,
        )
        self.my_cursor = self.conn.cursor()  # cursor used for all statements
        self.start_id = 1  # ID assigned to the next parsed movie
        self.all_movies_list = []  # accumulates the movies of every page

    def close(self) -> None:
        """Close the cursor and the MySQL connection."""
        self.my_cursor.close()
        self.conn.close()

    def send_request(self, url: str) -> str:
        """Send a GET request and return the page's HTML source.

        url: the link to request.
        return: the response text when the status code is 200; otherwise
            the sentinel string '-1' (also returned when the request itself
            fails, e.g. on a timeout or connection error).
        """
        # Build the request headers with a Chrome user agent string.
        headers = {
            'user-Agent': UserAgent().chrome,
        }
        try:
            # Without a timeout a stalled server would block forever.
            response = requests.get(
                url=url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print('request error', err)
            return '-1'
        if response.status_code == 200:
            return response.text
        print('status_code error')
        return '-1'

    def parse_html(self, html: str) -> Tuple[Tuple, ...]:
        """Parse one list page and return the movies found on it.

        html: HTML source of a list page.
        return: a tuple with one inner tuple per movie, laid out as
            (id, name, href, performers, score, quote). IDs come from
            ``self.start_id``, which is advanced once per stored movie.
            Entries whose score cannot be converted to float are skipped
            and do not consume an ID.
        """
        tree = etree.HTML(html, etree.HTMLParser())
        # Every <li> under the grid_view list holds one movie.
        li_list = tree.xpath('//ol[@class="grid_view"]/li')
        movies = []
        for li in li_list:
            name = ''.join(li.xpath('./div/div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'))
            href = ''.join(li.xpath('./div/div[@class="info"]/div[@class="hd"]/a/@href'))
            performers = ''.join(li.xpath('string(./div/div[@class="info"]/div[@class="bd"]/p[1])'))
            # Strip all whitespace from the multi-line paragraph.
            performers = self._WHITESPACE.sub('', performers)
            score_text = ''.join(li.xpath('./div/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'))
            quote = ''.join(li.xpath('./div/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'))
            try:
                score = float(score_text)
            except ValueError:
                # No usable rating: skip this entry, keep the ID for the next.
                print('skip entry without a valid score:', name)
                continue
            movie = (self.start_id, name, href, performers, score, quote)
            movies.append(movie)
            # Advance the ID before printing so that a failure while
            # printing can never leave a stored record with a reused ID.
            self.start_id += 1
            print(*movie, sep=" ")
        return tuple(movies)

    def create_table(self, table_name: str) -> None:
        """Create the movie table in MySQL if it does not exist yet.

        table_name: name of the table to create. It is interpolated into
            the statement text (identifiers cannot be bound as query
            parameters), so it must come from trusted code.
        """
        sql_create_table = (
            "create table if not exists {}("
            "ID int ,"
            "mv_name varchar(50),"
            "href varchar(50),"
            "performers varchar(300),"
            "score decimal(3, 1),"
            "quote varchar(50), "
            "primary key (ID)"
            ") ENGINE=INNODB DEFAULT CHARSET='utf8mb4' COLLATE='utf8mb4_unicode_ci'"
        ).format(table_name)
        self.my_cursor.execute(sql_create_table)

    def insert_to_mysql(self, table_name: str, movies_tuple: Tuple[Tuple, ...]) -> None:
        """Insert every movie of ``movies_tuple`` into the table and commit.

        table_name: name of the target table (trusted, see create_table).
        movies_tuple: tuple of 6-field movie tuples, one row each.
        """
        sql_insert = "insert into {} values (%s, %s, %s, %s, %s, %s)".format(table_name)
        self.my_cursor.executemany(sql_insert, movies_tuple)
        self.conn.commit()

    def append_to_MV_list(self, movies_tuple: Tuple[Tuple, ...]) -> None:
        """Append all movies of ``movies_tuple`` to ``self.all_movies_list``.

        movies_tuple: the movie tuples to add.
        """
        self.all_movies_list.extend(movies_tuple)

    def writer_into_csv(self, file_name: str, movies_tuple: Tuple[Tuple, ...]) -> None:
        """Write the header row and all movies to a UTF-8 CSV file.

        file_name: path of the CSV file to create (overwritten if present).
        movies_tuple: the movie tuples to write, one row each.
        """
        # newline='' lets the csv module control line endings itself;
        # without it every row is followed by a blank line on Windows.
        with open(file_name, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self._HEADERS)
            writer.writerows(movies_tuple)

    def writer_into_txt(self, file_name: str, movies_tuple: Tuple[Tuple, ...]) -> None:
        """Write the header line and all movies to a UTF-8 text file.

        Each movie becomes one line with its fields separated by a space.

        file_name: path of the text file to create (overwritten if present).
        movies_tuple: the movie tuples to write.
        """
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(' '.join(self._HEADERS) + '\n')
            for movie in movies_tuple:
                # Stringify every field, not only the numeric id and score.
                f.write(' '.join(map(str, movie)) + '\n')

    def get_next_page_url(self, html: str) -> str:
        """Extract the URL of the next list page from a page's HTML.

        html: HTML source of the current page.
        return: the absolute URL of the next page, or '' when the page has
            no "next" link.
        """
        tree = etree.HTML(html, etree.HTMLParser())
        query = ''.join(tree.xpath('//span[@class="next"]/a/@href'))
        if not query:
            return ''
        return 'https://movie.douban.com/top250{}'.format(query)
if __name__ == '__main__':
    # Script entry point: crawl every page of the Top 250 list, then store
    # the collected movies in MySQL, a text file and a CSV file.
    bouban = bouban_Top250()
    try:
        url = 'https://movie.douban.com/top250?start=0&filter='
        bouban.create_table('douban')
        # get_next_page_url returns '' on the last page, which ends the loop.
        while url:
            html = bouban.send_request(url)
            url = bouban.get_next_page_url(html)
            movies_tuple = bouban.parse_html(html)
            bouban.append_to_MV_list(movies_tuple)
            if url:
                # Pause between page requests; no need to wait after the
                # final page.
                time.sleep(1)
        all_movies_tuple = tuple(bouban.all_movies_list)
        bouban.insert_to_mysql('douban', all_movies_tuple)
        bouban.writer_into_txt('douban.txt', all_movies_tuple)
        bouban.writer_into_csv('douban.csv', all_movies_tuple)
    finally:
        # Release the database resources even when a step above fails.
        bouban.my_cursor.close()
        bouban.conn.close()
准备工作
- 安装好程序中用到的第三方库
- 安装好mysql,并将mysql的服务打开
- 在__init__方法初始化中,将mysql的用户名和密码、数据库更改为自己的,特别是端口号,因为我的电脑中有两个不同的mysql版本,所以我用的端口号是3307
- 确保自己的电脑有网
- 认真看代码,注释中都写了代码的功能,有一些代码看不懂要去查一查
- 本文写于 2021 年 3 月,之后豆瓣如果更新了反爬虫机制,可能会导致爬取失效!
结果展示
心得
- re.sub() 方法真是坑:Python 的字符串是不可变对象,所以它不会直接修改原来的文本,而是把替换后的新文本作为返回值返回,必须用变量接收这个返回值;这一点弄得我花了几十分钟
- 将文本写入txt文件中不会自动换行,还要自己加换行符,害得我又删除->更改->删除…
- 将文本写入txt文件中不指定编码方式会导致乱码,我去
- 你看看下面的结果:
a1 = ((1, 2), (3, 4))
a2 = (('a', 'b'))
a3 = [1, 2]
a4 = [(1, 2)]
a1 + a4 = ?
a4.append(a1) = ?
a3 + a4 = ?
我又新建了好几个文件来实验,好烦
- 从编写到完成,我的mysql中的表已经删除了不少于20次