Python 爬取豆瓣电影 TOP250
- 利用第三方库 requests 实现请求
- 使用 XPath 解析网页
- 采用 MySQL 存储
库
- requests:`import requests`
- lxml:`from lxml import etree`(采用 XPath 解析网页)
- pymysql:`import pymysql`(操作 MySQL)
数据库DDL
-- Table holding one row per Douban Top 250 entry.
-- IF EXISTS lets this script run on a fresh database where db250 has not
-- been created yet (a plain DROP TABLE would fail there).
DROP TABLE IF EXISTS db250;
CREATE TABLE db250
(
    排名 INT DEFAULT 0 NOT NULL,    -- rank (numeric default, not a quoted string)
    中文影名 VARCHAR(50) NULL,      -- Chinese title
    英文影名 VARCHAR(80) NULL,      -- English title
    评分 FLOAT NULL,                -- rating
    导演 VARCHAR(60) NULL,          -- director
    主演 VARCHAR(60) NULL,          -- leading actors
    时间 VARCHAR(50) NULL,          -- release year
    地区 VARCHAR(20) NULL,          -- country / region
    类型 VARCHAR(15) NULL,          -- genre
    影评 VARCHAR(100) NULL          -- one-line quote
);
数据库连接
# Open a connection to the local MySQL server; the passwd and db values are
# placeholders ("password" / "database name") to be replaced with real ones.
self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root',passwd='密码', db='数据库名', charset='utf8')
# Cursor used to execute SQL statements on that connection.
self.cursor = self.db.cursor()
构建网页链接
# Common prefix of every list page; the "start" offset is appended below.
self.base_url = 'https://movie.douban.com/top250?start='
# One URL per page: offsets 0, 25, ..., 225 (ten pages).
self.url_list = [self.base_url + str(i) + "&filter=" for i in range(0, 226, 25) ]
请求及解析网页
# Excerpt: fetch every list page and pull the raw fields out of each movie
# entry. Indentation restored so the nested loops are valid Python.
for url in self.url_list:
    # Fetch the page
    req = requests.get(url, headers=self.headers)
    # Set the response encoding
    req.encoding = 'utf-8'
    # Parse the page
    root = etree.HTML(req.content)
    texts = root.xpath('//ol/li/div[@class="item"]')
    for text in texts:
        # Defaults used when a field is missing from the page
        Chinese_Name = ''
        English_Name = ''
        director = 'NULL'
        actor = 'NULL'
        # Rank
        rank = int(text.xpath('./div[@class="pic"]/em/text()')[0])
        # Link to the movie's detail page
        movie_url = text.xpath('./div[@class="pic"]/a/@href')[0]
        # Title(s)
        name = text.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        # Director / actors line and year / region / genre line
        director_actor = text.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
        # Rating
        rating_num = text.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
        # One-line quote (may be absent)
        quote = text.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')
代码
import requests
import pymysql
from lxml import etree
class DB250_spride(object):
    """Scrape the Douban Top 250 movie list and store every entry in MySQL.

    The constructor builds the ten list-page URLs and opens the database
    connection; ``Parse_url_and_InsertDB`` downloads each page, extracts
    one row per movie and inserts it into ``yw_test.db250``; ``CloseDB``
    releases the cursor and the connection.
    """

    # Parameterized INSERT: the driver escapes every value, so titles or
    # names containing quotes can neither break nor inject into the SQL.
    _INSERT_SQL = (
        "insert into yw_test.db250"
        "(排名, 中文影名, 英文影名, 评分, 导演, 主演, 时间, 地区, 类型, 影评) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    # Seconds to wait for a page before giving up instead of hanging forever.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        """Build the page URL list and open the MySQL connection."""
        self.base_url = 'https://movie.douban.com/top250?start='
        # Offsets 0, 25, ..., 225 -> ten pages of 25 movies each.
        self.url_list = [
            self.base_url + str(i) + "&filter=" for i in range(0, 226, 25)
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        # Connect to the database
        self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                  passwd='密码', db='yw_test', charset='utf8')
        self.cursor = self.db.cursor()

    def Parse_url_and_InsertDB(self):
        """Download every list page, print each movie and insert it.

        A page that cannot be fetched, or an entry whose markup lacks an
        expected field, is reported on stdout and skipped so that one bad
        item does not abort the whole crawl.
        """
        for url in self.url_list:
            try:
                req = requests.get(url, headers=self.headers,
                                   timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(e)
                continue
            if req.status_code != 200:
                print("skip %s: HTTP %s" % (url, req.status_code))
                continue
            # The raw bytes are handed to lxml, so no text-encoding override
            # on the response object is needed.
            root = etree.HTML(req.content)
            for item in root.xpath('//ol/li/div[@class="item"]'):
                try:
                    row, banner_width = self._extract_row(item)
                except (IndexError, ValueError) as e:
                    print("skip malformed item:", e)
                    continue
                self._print_row(row, banner_width)
                self._insert_row(row)

    def _extract_row(self, item):
        """Return ``(row, banner_width)`` for one ``div.item`` element.

        ``row`` is the value tuple in the column order of ``_INSERT_SQL``.
        Raises IndexError / ValueError when a mandatory field (rank,
        title, info lines, rating) is missing or not numeric.
        """
        rank = int(item.xpath('./div[@class="pic"]/em/text()')[0])
        titles = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        info_lines = item.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
        rating_num = item.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
        quotes = item.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')

        chinese_name, english_name = self._split_titles(titles)
        credits = self._compact_credits(info_lines[0])
        director, actor = self._split_credits(credits)
        meta = self._compact_meta(info_lines[1])
        year, location, types = self._split_meta(meta)
        # Some movies have no quote; the literal string 'NULL' is kept as
        # the placeholder so stored values match rows written previously.
        quote = str(quotes[0]) if quotes else 'NULL'

        banner_width = len(credits) + len(meta) + len(rating_num) + 20
        row = (rank, chinese_name, english_name, float(rating_num),
               director, actor, year, location, types, quote)
        return row, banner_width

    @staticmethod
    def _split_titles(titles):
        """Return ``(chinese, english)`` from the title span texts.

        The English title is only taken when exactly two spans exist; its
        '/' separator and leading whitespace are removed.
        """
        chinese = str(titles[0])
        english = ''
        if len(titles) == 2:
            english = str(titles[1]).replace('/', '').lstrip()
        return chinese, english

    @staticmethod
    def _compact_credits(line):
        """Strip spaces/newlines from the credits line.

        The run of three non-breaking spaces between the director part and
        the actor part becomes a single space, which is the only separator
        left afterwards.
        """
        return line.replace(' ', '').replace('\xa0' * 3, ' ').replace('\n', '')

    @staticmethod
    def _split_credits(credits):
        """Return ``(director, actor)`` from a compacted credits line.

        ``actor`` is the placeholder string 'NULL' unless the line has
        exactly two space-separated parts. No SQL quote-doubling is done
        here because the INSERT is parameterized.
        """
        parts = credits.split(' ')
        director = parts[0].replace('导演:', '')
        actor = 'NULL'
        if len(parts) == 2:
            actor = parts[1].replace('主演:', '')
        return director, actor

    @staticmethod
    def _compact_meta(line):
        """Strip spaces from the year/region/genre line.

        The '\\xa0/\\xa0' separators become single spaces and the
        surrounding newlines are trimmed so they do not end up in the
        stored year or genre.
        """
        return line.replace(' ', '').replace('\xa0/\xa0', ' ').strip()

    @staticmethod
    def _split_meta(meta):
        """Return ``(year, location, types)``; missing fields become ''."""
        fields = meta.split(' ')
        fields += [''] * (3 - len(fields))
        return fields[0], fields[1], fields[2]

    @staticmethod
    def _print_row(row, banner_width):
        """Print one movie between two banner lines of asterisks."""
        (rank, chinese_name, english_name, rating,
         director, actor, year, location, types, quote) = row
        banner = "*" * banner_width
        print(banner, end='\n\n')
        for value in (rank, chinese_name, english_name, rating,
                      year, location, types, director, actor):
            print(value)
        if quote != 'NULL':
            print(quote)
        print(banner, end='\n\n')

    def _insert_row(self, row):
        """Insert one row and commit; roll back and report on a DB error."""
        try:
            self.cursor.execute(self._INSERT_SQL, row)
            self.db.commit()
        except pymysql.MySQLError as e:
            self.db.rollback()
            print(e)

    def CloseDB(self):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.db.close()
def main():
    """Run the scraper once and always release the database connection."""
    spride1 = DB250_spride()
    try:
        spride1.Parse_url_and_InsertDB()
    finally:
        # Close the cursor and connection even if the crawl raises.
        spride1.CloseDB()
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
- 技术有限,欢迎大家在评论中指正,一起交流学习。