1. Install Python 3.8.
2. Install Anaconda, which provides virtual Python environments and package management (see the setup sketch after this list).
3. Install the required packages with pip:
pip install requests
pip install beautifulsoup4 (note: used to extract data from HTML and XML documents)
pip install lxml (note: if downloads from the default package index are slow, you can switch to a domestic mirror such as the Douban source, as shown below)
pip install fake-useragent (note: used to generate HTTP User-Agent headers)
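Putting steps 2 and 3 together, a typical setup might look like the lines below. The environment name spider is just an illustration, and the -i option points pip at a mirror index; the URL shown is the commonly cited Douban PyPI source, so check that it is still current before relying on it.
conda create -n spider python=3.8
conda activate spider
pip install lxml -i https://pypi.douban.com/simple/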
A small example: a crawler for Douban movie images
import requests
from fake_useragent import UserAgent
import os
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
# Directory where the downloaded posters will be stored
download_path = './douban'
if not os.path.exists(download_path):
    os.makedirs(download_path)
def download_pic(url):
    # Fake a Chrome User-Agent so the request looks like it comes from a browser
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # The poster carousel on the Douban home page is a <ul class="ui-slide-content">
    content = soup.find('ul', class_="ui-slide-content")
    images = content.find_all('img')
    for i, image in enumerate(images):
        name = image.get('alt')   # movie title, used as the file name
        link = image.get('src')   # image URL
        print(i, name, link)
        # A '/' in the title would be read as a path separator, so strip it out
        if '/' in name:
            print(f'invalid character in name: {name}')
            name = name.replace('/', '')
            print(f'new name: {name}')
        # Could also be done with requests + writing the file; see the sketch below
        urlretrieve(link, f'{download_path}/{name}.jpg')
        print(f'download {i} succeeded')
    print(f'all movie images from {url} have been downloaded')
def main():
    download_pic('https://movie.douban.com/')

if __name__ == '__main__':
    main()
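As the comment in download_pic notes, urlretrieve can be replaced with requests plus a plain file write, which also lets you send the same fake User-Agent header with the image request. A minimal sketch (the helper name save_image is hypothetical, not part of the original script):
def save_image(link, path, headers):
    # Fetch the image with the same headers as the page request
    resp = requests.get(link, headers=headers)
    resp.raise_for_status()  # raise if the server returned an error status
    # Write the raw bytes to disk
    with open(path, 'wb') as f:
        f.write(resp.content)
Inside the loop, the urlretrieve call would then become save_image(link, f'{download_path}/{name}.jpg', headers).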