在上次爬取信息的基础上下载图片。
下载挺容易的
在封面命名上花了大把时间。。。。。
import requests
import bs4
from bs4 import BeautifulSoup
import re
# Maoyan Top-100 movie board; pages are selected via the "offset" query parameter.
url = 'http://maoyan.com/board/4'
# Local output directory (Windows-style path); movies.txt and pic// live under it.
path = 'C://Users//谢迎超//Desktop//download//'
# Browser-like request headers: Maoyan's robots rules block generic crawlers,
# so the User-Agent/Referer combination below imitates a real Chrome visit.
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Host': 'maoyan.com',
'Referer': 'http://maoyan.com/board',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.349'
}
def GetText(url, header, offset):
    """Fetch one page of the Maoyan board and return its HTML text.

    The board shows ten movies per page and paginates via the ``offset``
    query parameter (0, 10, 20, ...).  Browser-like headers are required
    because Maoyan disallows generic crawlers.

    Returns the decoded HTML on success, or "" on any request failure
    (preserving the original best-effort contract).
    """
    try:
        # Pass the query parameter as a dict so requests URL-encodes it;
        # a timeout prevents the crawl from hanging on a dead connection.
        r = requests.get(url, params={'offset': offset}, headers=header, timeout=10)
        r.raise_for_status()
        # Trust content sniffing over the (often wrong) response header charset.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors, not e.g. KeyboardInterrupt.
        return ""
def Text2html(text, ulist):
    """Parse one board page's HTML and append one text record per movie to ulist.

    Each movie's info sits inside a <dd> tag.  getText() flattens the tag to
    plain text; the chained replace() calls then collapse the surrounding
    blank lines and spaces into a compact newline-separated record.  Download_pic
    later splits this record on '\\n' (title at index 1), so the exact
    normalization below must not change.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # find_all('dd') yields only Tag objects, so no isinstance filtering is needed.
    for dd in soup.find_all('dd'):
        record = dd.getText().replace('\n\n\n\n\n\n\n\n', ' ').replace(' ', '').replace('\n\n\n\n', '\n')
        ulist.append([record])  # nested list: one inner list per movie
def Save2txt(ulist, path):
    """Write every movie record in ulist to <path>movies.txt.

    UTF-8 must be forced: the default Windows codec raises
    UnicodeEncodeError on the Chinese movie titles.
    """
    # Context manager guarantees the file is closed even if a write fails.
    with open(path + 'movies.txt', 'w', encoding='UTF-8') as f:
        for record in ulist:
            f.writelines(record)  # each record is a one-string list
def Download_pic(ulist, text, k):
    """Download the ten poster images found in one board page's HTML.

    ulist: all movie text records (used to name the files by title).
    text:  raw HTML of one board page.
    k:     index into ulist of this page's first movie; advanced by one
           per downloaded poster so posters pair with the right titles.
    """
    # Non-greedy quantifiers and escaped dots so each poster URL matches
    # individually; a greedy .* could swallow a whole line containing
    # several URLs into one bogus match.
    pattern = re.compile(r'http://p.*?\.meituan\.net/movie/.*?\.jpg@160w_220h_1e_1c')
    for purl in pattern.findall(text):
        # Record layout produced by Text2html: split on '\n', title at index 1.
        title = ulist[k][0].split("\n")[1]
        pic = requests.get(purl)  # NOTE(review): consider timeout/raise_for_status here too
        # Binary mode for image bytes; `with` closes the file even on write errors.
        with open(path + 'pic//' + str(title) + ".jpg", 'wb') as f:
            f.write(pic.content)
        k = k + 1  # step to the next movie's record
def main():
    """Crawl all ten board pages, save the text records, then download posters."""
    ulist = []
    pages = []  # raw HTML of each page, kept so posters can be extracted later
    for i in range(10):
        offset = str(i * 10)  # the board paginates in steps of ten
        pages.append(GetText(url, header, offset))
        Text2html(pages[i], ulist)
    Save2txt(ulist, path)
    for j, page in enumerate(pages):
        # Each page holds ten movies; j * 10 is the ulist index of its first
        # record, letting Download_pic pair posters with titles.
        Download_pic(ulist, page, j * 10)

if __name__ == '__main__':
    # Guard so importing this module does not immediately start the crawl.
    main()
效果图: