多线程爬取豆瓣电影top250
之前写过一篇有关多线程爬虫的文章,里面对分析过程进行了详细的介绍,所以这里就不对过程进行分析了,如果你是刚接触爬虫的新手的话可以参考一下我之前写的爬虫:
https://blog.csdn.net/weixin_40481076/article/details/101312325
本次爬虫的目标网站:https://movie.douban.com/top250?
爬取信息:top250页面电影排名和电影名字、图片url,将图片下载下来,以"电影排名---电影名字"命名图片(与代码中使用的 '---' 分隔符一致)
代码
程序实现多线程采用的是继承threading.Thread类重写一个新的线程类,爬虫的主要逻辑写在run方法里面。
import os
import threading
import urllib
import urllib.request

import lxml
import requests
from lxml import etree
'''
多线程爬取豆瓣电影top250
'''
class ConsumerThread(threading.Thread):
    """Worker thread that crawls one slice of the Douban movie Top-250 list.

    Each thread walks page offsets ``startNum .. endNum`` (step 25, the site's
    page size), extracts rank, title and poster URL for every movie item, and
    downloads each poster as ``<rank>---<title>.jpg`` into ``path``.
    """

    def __init__(self, startUrl, headers, startNum, endNum, path, tname):
        threading.Thread.__init__(self)
        self.startUrl = startUrl  # base list URL; the page offset is appended
        self.headers = headers    # HTTP headers for the list-page requests
        self.startNum = startNum  # first page offset (inclusive)
        self.endNum = endNum      # last page offset (inclusive)
        self.path = path          # destination directory for poster images
        self.tname = tname        # human-readable thread label for log output

    def run(self):
        # Douban paginates with ?start=0,25,50,...; endNum is inclusive, hence +25.
        for page in range(self.startNum, self.endNum + 25, 25):
            html = request_page(self.startUrl + str(page), self.headers)
            if html is None:
                # Fetch failed or non-200 status: skip this page instead of
                # letting etree.HTML(None) blow up inside a broad except.
                continue
            try:
                root = etree.HTML(html)
                for div in root.xpath("//div[@class='item']"):
                    try:
                        num = div.xpath('./div[1]/em[1]/text()')[0]
                        name = div.xpath('./div[2]/div[1]/a[1]/span[1]/text()')[0]
                        imageUrl = div.xpath('./div[1]/a[1]/img[1]/@src')[0]
                        print(name)
                        print(imageUrl)
                        print('线程' + self.tname + '正在下载图片')
                        print('')
                        download_pic(imageUrl, str(num) + '---' + name, self.path)
                    except Exception as e:
                        # One malformed item (missing node -> IndexError etc.)
                        # must not abort the rest of the page.
                        print(str(e))
                        continue
            except Exception as e:
                # Parse failure on the whole page: log and move on.
                print(str(e))
                continue
def request_page(startUrl, headers, timeout=10):
    """Fetch *startUrl* and return its HTML text, or ``None`` on any failure.

    Returns ``None`` both when the request raises and when the server answers
    with a non-200 status, so callers must check before parsing.
    The *timeout* (seconds, default 10) keeps a stalled connection from
    hanging the calling thread forever; the original code had none.
    """
    try:
        res = requests.get(startUrl, headers=headers, timeout=timeout)
        # Douban serves UTF-8; set it explicitly to avoid mojibake.
        res.encoding = 'utf-8'
        if res.status_code == 200:
            return res.text
        return None  # explicit: treat non-200 pages as a failed fetch
    except requests.RequestException:
        return None
def get_headers():
    """Return the HTTP request headers used for the Douban list pages."""
    # Accept-Encoding is deliberately omitted: advertising br/gzip made the
    # decoded page come back garbled in the author's testing.
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/top250?start=0&filter=',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    }
# def makeFile(path,fileName,):
# if not os.path.exists(path):
# os.makedirs(path)
def download_pic(url, name, path):
    """Download *url* and save it as ``<path><name>.jpg``.

    *path* is created if missing. Callers pass it with a trailing slash and
    the filename is appended directly — that convention is preserved.
    Errors are logged and swallowed so one bad image never kills a thread.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    try:
        # Context managers close both the HTTP response and the output file
        # even on error — the original leaked the urlopen response and
        # called file.close() redundantly inside the with-block.
        with urllib.request.urlopen(url, timeout=5) as resp:
            data = resp.read()
        with open(path + name + '.jpg', 'wb') as file:
            file.write(data)
    except Exception as e:
        # Best-effort: report and continue with the next image.
        print(str(e))
if __name__ == '__main__':
    base_url = 'https://movie.douban.com/top250?start='
    common_headers = get_headers()
    # Split the 250 entries between two workers: A takes page offsets
    # 0..100 (ranks 1-125), B takes 125..225 (ranks 126-250).
    workers = [
        ConsumerThread(base_url, common_headers, 0, 100, 'd:/download/豆瓣电影top250AA/', 'A'),
        ConsumerThread(base_url, common_headers, 125, 225, 'd:/download/豆瓣电影top250BB/', 'B'),
    ]
    for worker in workers:
        worker.start()
    # Wait for both slices to finish before announcing completion.
    for worker in workers:
        worker.join()
    print('*' * 10 + '下载完成!' + '*' * 10)
'''
https://movie.douban.com/top250?start=
'''