# 本篇文章爬取豆瓣电影排行榜(Top250),使用的模块:第三方模块 requests、正则模块 re、数据库 MySQL(通过 pymysql)等。
import requests
import re
import pymysql
class DouBan:
    """Scraper for the Douban movie Top 250 chart.

    Each chart page is downloaded with ``requests``; the title, year,
    country and rating of every movie are extracted with a regular
    expression and appended, one movie per line, to a local text file.
    ``writeCommentToMysql`` is an opt-in alternative sink that stores a
    record in MySQL instead.
    """

    # One match per movie entry. Capture groups, in order: title, the text
    # between <br> and the first " / " (year), the following
    # space-delimited token (country), and the rating.
    _INFO_PATTERN = re.compile(
        r'<div class="info">[\s\S]*?<span class="title">([\s\S]*?)</span>'
        r'[\s\S]*?<br>([\s\S]*?) / ([\s\S]*?) [\s\S]*?'
        r'<span class="rating_num" property="v:average">([\s\S]*?)</span>'
    )
    _OUTPUT_FILE = '豆瓣.txt'
    # Seconds to wait for the server; without a timeout requests can block
    # forever on a stalled connection.
    _TIMEOUT = 10
    # NOTE(review): Douban is reported to reject the default
    # python-requests User-Agent -- confirm against the live site.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def __init__(self):
        """Set up the paging URL template and open the MySQL connection.

        The connection is opened eagerly, so a MySQL server must be
        reachable with these credentials even when only the text-file
        sink is used.
        """
        self.baseurl = 'https://movie.douban.com/top250?start={}&filter='
        # Database connection parameters.
        # NOTE(review): credentials are hard-coded; consider reading them
        # from the environment.
        self.host = 'localhost'
        self.user = 'root'
        self.pwd = '123456'
        # Keyword arguments: PyMySQL 1.0+ rejects positional arguments.
        self.conn = pymysql.connect(
            host=self.host, user=self.user, password=self.pwd
        )
        self.cur = self.conn.cursor()

    def getHTml(self, url):
        """Download one chart page and pass its HTML to ``parseHtml``.

        A non-200 response is reported and skipped instead of being
        parsed as if it were a chart page. Network errors raised by
        ``requests`` propagate to the caller.
        """
        res = requests.get(url, headers=self._HEADERS, timeout=self._TIMEOUT)
        if res.status_code != 200:
            print('请求失败:', url, res.status_code)
            return
        res.encoding = 'utf-8'
        self.parseHtml(res.text)

    def parseHtml(self, html):
        """Extract every movie record from *html* and save each one."""
        for record in self._extractRecords(html):
            self.writeComment(record)

    @classmethod
    def _extractRecords(cls, html):
        """Return a list of [title, year, country, rating] lists, stripped."""
        return [
            [field.strip() for field in match]
            for match in cls._INFO_PATTERN.findall(html)
        ]

    # -------------------- save to a local file --------------------
    def writeComment(self, info):
        """Append one record to the output file as a space-separated line.

        Every field is followed by a single space and the line ends with
        a newline. The file is written as UTF-8 so Chinese titles are
        stored correctly regardless of the locale's default encoding.
        I/O failures are reported on stdout rather than raised.
        """
        line = ''.join(field + ' ' for field in info) + '\n'
        try:
            with open(self._OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.write(line)
        except OSError as err:
            print('打开文件错误', err)

    # -------------------- save to MySQL (optional) --------------------
    def writeCommentToMysql(self, info):
        """Insert one record into the ``maoyan`` table of database ``maoyan``.

        The database and table are created on first use. *info* must hold
        at least four items: movie name, date, country and score. The
        values are passed as query parameters, never interpolated into
        the SQL text, because they come from scraped (untrusted) HTML.
        """
        self.cur.execute('create database if not exists maoyan;')
        self.cur.execute('use maoyan')
        self.cur.execute(
            'create table if not exists maoyan('
            'id int primary key auto_increment,'
            'moviename varchar(60),'
            'date char(100),'
            'country varchar(20),'
            'score varchar(10))default charset="utf8";'
        )
        self.cur.execute(
            'insert into maoyan(moviename,date,country,score) '
            'values(%s,%s,%s,%s);',
            (info[0], info[1], info[2], info[3]),
        )
        self.conn.commit()

    def main(self):
        """Crawl the ten chart pages (start offsets 0, 25, ..., 225)."""
        for page in range(10):
            url = self.baseurl.format(page * 25)
            print(url)
            self.getHTml(url)
        print('写入完成')
if __name__ == '__main__':
    # Script entry point: build the scraper and crawl the whole chart.
    spider = DouBan()
    spider.main()