一、豆瓣电影图片爬取完整代码
import requests
from bs4 import BeautifulSoup
def rqdm(page, timeout=10):
    """Request one page of the Douban Top 250 list and return its HTML text.

    Args:
        page: Zero-based page index; it is sent as ``start = page * 25``.
            (The original note said 0-10; with 25 entries per page a
            250-entry list would span pages 0-9.)
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        The body of the response as text (``response.text``).
    """
    # Cookie values look like they were captured from a browser session and
    # may need refreshing. The original literal listed __utma/__utmb/__utmc/
    # __utmz twice; a dict keeps only the last value per key, so only those
    # effective values are kept here.
    cookies = {
        'bid': 'SAuj7a0wxfE',
        '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1715653858%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D6Y-aEUQlcoBy5UlLGRR5e168IWs1vBRTCigsJBcmo24lIcGUj8aWr0jBF3bSDN8k%26wd%3D%26eqid%3De5b7863d00011148000000036642ccde%22%5D',
        '_pk_id.100001.4cf6': '909f5217bf632b62.1715653858.',
        '_pk_ses.100001.4cf6': '1',
        'ap_v': '0,6.0',
        '__utma': '223695111.1139887529.1715653858.1715653858.1715653858.1',
        '__utmb': '223695111.0.10.1715653858',
        '__utmc': '223695111',
        '__utmz': '223695111.1715653858.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
        '__gads': 'ID=a237730cef6ecf8b:T=1715653859:RT=1715653859:S=ALNI_MYNw1QcI01XmONRVcnu8Ss_YVq3Ng',
        '__gpi': 'UID=00000e1bb98e72ab:T=1715653859:RT=1715653859:S=ALNI_MYZpOdCEhfn0VtOcWSo0Xk7CIcB2A',
        '__eoi': 'ID=8d7d8a24a7588f7d:T=1715653859:RT=1715653859:S=AA-AfjazxlgfDhvQPXw9IvwOChbu',
        '__yadk_uid': 'YlTsOIy4V94FoIKvzG7ijYS30DHbVYsJ',
    }
    # Browser-like request headers sent with every list-page request.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://movie.douban.com/top250',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # The list is paginated through the ``start`` offset, 25 entries per page.
    params = {
        'start': str(page * 25),
        'filter': '',
    }
    response = requests.get(
        'https://movie.douban.com/top250',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response.text
def getpic(pages):
    """Download the poster image of every entry on the first *pages* pages.

    Each list page is fetched with ``rqdm``, parsed with BeautifulSoup, and
    every ``<li>`` inside ``div.article > ol`` is expected to contain a
    ``div.pic`` whose ``<img>`` carries the title (``alt``) and the image
    URL (``src``). Images are written to ``results/<alt>.webp``.

    Args:
        pages: Number of list pages to crawl, starting from page 0.
    """
    import os  # local import so the block does not depend on file-level imports

    # open() does not create missing directories, so make sure it exists.
    os.makedirs('results', exist_ok=True)

    for page in range(pages):
        soup = BeautifulSoup(rqdm(page), 'lxml')
        items = soup.find('div', class_='article').ol.find_all('li')
        for item in items:
            # Look the <img> tag up once instead of once per attribute.
            img = item.find('div', class_='pic').a.img
            pic_alt = img.get('alt')
            pic_link = img.get('src')
            # A timeout keeps one stalled download from hanging the crawl.
            pic = requests.get(pic_link, timeout=10)
            # Optionally pause here (e.g. time.sleep(3)) to throttle requests.
            # Path separators in a title would otherwise point into a
            # non-existent sub-directory.
            safe_name = str(pic_alt).replace('/', '_').replace('\\', '_')
            # Binary mode: the response body is raw image bytes.
            with open(f'results/{safe_name}.webp', 'wb') as f:
                f.write(pic.content)
# Script entry point. The original compared against "__main()__", which never
# equals __name__, so the crawl was never started.
if __name__ == "__main__":
    getpic(5)  # only 5 pages are crawled here; all pages could be crawled as well
二、分步骤演示代码的写作
封装好的代码其实是一步一步地改写过来的,初学者尤其应该这样做。只有到后期非常熟练了,才可以跳过中间步骤。
第一步:请求网页
import requests
from bs4 import BeautifulSoup
# Step 1: request the first list page.
# Cookie values look like they were captured from a browser session and may
# need refreshing. The original literal listed __utma/__utmb/__utmc/__utmz
# twice; a dict keeps only the last value per key, so only those effective
# values are kept here.
cookies = {
    'bid': 'SAuj7a0wxfE',
    '_pk_ref.100001.4cf6': '%5B%22%22%2C%22%22%2C1715653858%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D6Y-aEUQlcoBy5UlLGRR5e168IWs1vBRTCigsJBcmo24lIcGUj8aWr0jBF3bSDN8k%26wd%3D%26eqid%3De5b7863d00011148000000036642ccde%22%5D',
    '_pk_id.100001.4cf6': '909f5217bf632b62.1715653858.',
    '_pk_ses.100001.4cf6': '1',
    'ap_v': '0,6.0',
    '__utma': '223695111.1139887529.1715653858.1715653858.1715653858.1',
    '__utmb': '223695111.0.10.1715653858',
    '__utmc': '223695111',
    '__utmz': '223695111.1715653858.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    '__gads': 'ID=a237730cef6ecf8b:T=1715653859:RT=1715653859:S=ALNI_MYNw1QcI01XmONRVcnu8Ss_YVq3Ng',
    '__gpi': 'UID=00000e1bb98e72ab:T=1715653859:RT=1715653859:S=ALNI_MYZpOdCEhfn0VtOcWSo0Xk7CIcB2A',
    '__eoi': 'ID=8d7d8a24a7588f7d:T=1715653859:RT=1715653859:S=AA-AfjazxlgfDhvQPXw9IvwOChbu',
    '__yadk_uid': 'YlTsOIy4V94FoIKvzG7ijYS30DHbVYsJ',
}

# Browser-like request headers. Cookies are passed through the ``cookies``
# argument below, so no raw 'Cookie' header is set here.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'https://movie.douban.com/top250',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query string for the first page (offset 0).
params = {
    'start': '0',
    'filter': '',
}

# A timeout makes a stalled connection raise instead of blocking forever.
response = requests.get(
    'https://movie.douban.com/top250',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
# Bare expression: shows the HTML in an interactive session, no effect in a script.
response.text
第二步:解析网页
# Build the parse tree from the downloaded HTML using the lxml parser.
soup = BeautifulSoup(markup=response.text, features='lxml')
第三步:提取内容
# Step 3: pull the title and image URL out of every list entry and save the image.
import os

# open() does not create missing directories, so make sure it exists.
os.makedirs('results', exist_ok=True)

lis = soup.find('div', class_='article').ol.find_all('li')
for li in lis:
    # Look the <img> tag up once instead of once per attribute.
    img = li.find('div', class_='pic').a.img
    pic_alt = img.get('alt')
    pic_link = img.get('src')
    # A timeout keeps one stalled download from hanging the loop.
    pic = requests.get(pic_link, timeout=10)
    # Path separators in a title would otherwise point into a
    # non-existent sub-directory.
    safe_name = str(pic_alt).replace('/', '_').replace('\\', '_')
    # Binary mode: the response body is raw image bytes.
    with open(f'results/{safe_name}.webp', 'wb') as f:
        f.write(pic.content)
第四步:循环爬取多页
# Step 4: repeat request -> parse -> extract for the following pages.
import os

# open() does not create missing directories, so make sure it exists.
os.makedirs('results', exist_ok=True)

# Pages 1-9 only: page 0 (start=0) is the one requested in step 1.
for page in range(1, 10):
    url = 'https://movie.douban.com/top250?start={}&filter='.format(page * 25)
    print(url)
    response = requests.get(url, cookies=cookies, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    lis = soup.find('div', class_='article').ol.find_all('li')
    # A distinct loop variable: the original reused ``i`` for both loops.
    for li in lis:
        img = li.find('div', class_='pic').a.img
        pic_alt = img.get('alt')
        pic_link = img.get('src')
        pic = requests.get(pic_link, timeout=10)
        # Optionally pause here (e.g. time.sleep(3)) to throttle requests.
        # Path separators in a title would otherwise point into a
        # non-existent sub-directory.
        safe_name = str(pic_alt).replace('/', '_').replace('\\', '_')
        # Binary mode: the response body is raw image bytes.
        with open(f'results/{safe_name}.webp', 'wb') as f:
            f.write(pic.content)
通过以上四个步骤,我们就可以轻而易举地把代码封装起来,方便下次直接调用。