"""
爬取猫眼电影TOP100(http://maoyan.com/board/4?offset=90)
1). 爬取内容: 电影名称,主演, 上映时间,图片url地址保存到mariadb数据库中;
2). 所有的图片保存到本地/mnt/maoyan/电影名.png
"""
import re
from urllib.request import urlopen
from urllib import request
import pymysql
# Listing URL template; the board is paged 10 films at a time via ``offset``.
_BOARD_URL = 'http://maoyan.com/board/4?offset=%d'
_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) '
               'Gecko/20100101 Firefox/62.0')

# ``[^"]+`` instead of a CJK-only class: titles containing digits, Latin
# letters or punctuation used to be skipped, which shifted every following
# title against the star/release lists.
_POSTER_RE = re.compile(
    r'<img data-src="(?P<picture>[^"]+)" alt="(?P<name>[^"]+)" class="board-img" />')
# Non-greedy captures so a match can never run past its own closing </p>.
_STAR_RE = re.compile(r'<p class="star">\s*(.+?)\s*</p>')
_RELEASE_RE = re.compile(r'<p class="releasetime">(.+?)</p>')

# Expected table (must exist before the first run):
#   create table movies2 (电影名字 varchar(60) not null, 主演 varchar(200) not null,
#                         上映时间 varchar(50) not null, 图片url varchar(200) not null);
# Placeholders are filled in by the driver, so quotes in titles are safe.
_INSERT_SQL = 'insert into movies2 (电影名字,主演,上映时间,图片url) VALUES (%s,%s,%s,%s);'


def _fetch(url, headers=None):
    """Return the raw response body of *url*, closing the response afterwards."""
    req = request.Request(url, headers=headers or {})
    with urlopen(req) as resp:
        return resp.read()


def _parse_page(content):
    """Extract ``(name, star, release, picture_url)`` tuples from one board page.

    The three fields are matched independently and paired by position; if the
    page yields different counts, a warning is printed and pairing stops at
    the shortest list.
    """
    posters = _POSTER_RE.findall(content)   # [(picture, name), ...]
    stars = _STAR_RE.findall(content)
    releases = _RELEASE_RE.findall(content)
    if not (len(posters) == len(stars) == len(releases)):
        print('warning: field counts differ (posters=%d, stars=%d, releases=%d)'
              % (len(posters), len(stars), len(releases)))
    return [(name, star, release, picture)
            for (picture, name), star, release in zip(posters, stars, releases)]


def _poster_path(name):
    """Return the local file path for the poster of the film called *name*."""
    # Path separators in a title would otherwise point into another directory.
    safe_name = name.replace('/', '_').replace('\\', '_')
    return 'movies/%s.jpg' % safe_name


def _save_poster(name, picture_url):
    """Download *picture_url* and write it to the poster path for *name*."""
    import os  # local import: the file's import block does not provide ``os``

    path = _poster_path(name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    data = _fetch(picture_url)
    with open(path, 'wb') as f:
        f.write(data)


def getmovies(pages=10):
    """Crawl the Maoyan TOP100 board, store each film in MySQL and save posters.

    For every page the film name, starring line, release line and poster URL
    are inserted into table ``movies2`` of database ``bank``, and each poster
    is written to ``movies/<name>.jpg``.

    Args:
        pages: Number of 10-film pages to crawl, starting at offset 0.
            Defaults to 10 (the whole TOP100).
    """
    # NOTE(review): credentials are hard-coded; consider moving them to config.
    conn = pymysql.connect(user='root',
                           password='971203', charset='utf8', autocommit=True)
    try:
        conn.select_db('bank')
        cur = conn.cursor()
        try:
            for page in range(pages):
                url = _BOARD_URL % (page * 10)
                raw = _fetch(url, headers={'User-Agent': _USER_AGENT})
                content = raw.decode('utf-8')
                print("正在爬取地址")
                records = _parse_page(content)
                print(records)
                if not records:
                    continue
                cur.executemany(_INSERT_SQL, records)
                for name, _star, _release, picture in records:
                    _save_poster(name, picture)
        finally:
            cur.close()
    finally:
        # Always release the connection, even if a request or insert fails.
        conn.close()
# Module-level call: the crawl starts as soon as this file is executed or imported.
getmovies()