1.爬取豆瓣电影网页源码。
import requests

# Offset of the first movie on the requested page. The list is served
# 25 movies per page, so the offsets used are 0, 25, 50, ...
start = 0
url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
# A timeout keeps the script from blocking forever on a stalled connection.
data = requests.get(url, timeout=10)
# Uncomment to inspect the raw HTML that came back:
# print(data.text)
2.分析网页结构,爬取图片src。每页显示25部电影,翻页时URL中的start参数依次增加25:
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
......
from lxml import etree

# `start` is the offset of the movie the page begins with (0, 25, 50, ...).
start = 0
url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
# A timeout keeps the script from blocking forever on a stalled connection.
data = requests.get(url, timeout=10)
# print(data.text)
# Parse the HTML text into an element tree that can be queried with XPath.
page = etree.HTML(data.text)
# Collect the src attribute of every <img> element on the page.
imglink = page.xpath('//img/@src')
3.下载图片。
# `imglink` is the list of image URLs collected in the previous step and
# `path` is the output directory prefix defined in the complete script.
for src in imglink:
    img = requests.get(src, timeout=10)
    # Name each file after whatever follows the last '/' in its URL.
    # The with-block guarantees the file is closed even if the write fails.
    with open(path + src.split('/')[-1], 'wb') as out:
        out.write(img.content)
    print('正在保存……')
4.完整代码。
import os

import requests
from lxml import etree
# Output directory prefix for the downloaded images. File names are appended
# to it by plain string concatenation, hence the trailing backslash.
path = 'F:\\douban\\'
def main(start):
    """Download every image referenced on one page of the Douban Top 250 list.

    The page is fetched, the ``src`` of each ``<img>`` element is extracted,
    and each image is saved under ``path`` using the last segment of its URL
    as the file name.

    Args:
        start: Offset of the first movie on the page (0, 25, 50, ...).

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output directory or a file cannot be written.
    """
    url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    data = requests.get(url, timeout=10)
    # Fail loudly on an error page instead of silently finding no images.
    data.raise_for_status()
    page = etree.HTML(data.text)
    imglink = page.xpath('//img/@src')

    # Make sure the target directory exists before the first write.
    os.makedirs(path, exist_ok=True)

    for src in imglink:
        img = requests.get(src, timeout=10)
        # Do not store an HTTP error body on disk as if it were an image.
        img.raise_for_status()
        # The with-block closes the file handle even if the write fails.
        with open(path + src.split('/')[-1], 'wb') as out:
            out.write(img.content)
        print('正在保存……')
if __name__ == '__main__':
    # Ten pages of 25 movies each: offsets 0, 25, ..., 225.
    for offset in range(0, 250, 25):
        main(start=offset)
5.爬取结果。