代码在运行时可能会出现 403 报错,主要发生在爬取单个图片的时候,求大佬帮助。
(已解决)
import os
import os.path
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree
# Approach 1: download the images sequentially, without threads
def parse_page(url):
    """Download every image listed on one gallery page into ``images/``.

    Fetches ``url``, selects the ``<img>`` elements nested in
    ``<div class="cell">`` containers and saves each one under a file name
    built from its ``alt`` text plus the extension of its ``data-src`` URL.
    Elements lacking either attribute, and downloads that do not answer with
    HTTP 200, are skipped.

    Args:
        url: Address of the listing page to scrape.
    """
    # The same headers are sent with the page request and with every image
    # request; the original header-less urlretrieve call was the reported
    # source of the 403 errors.
    headers = {'User-Agent': '',
               'Referer': 'https://m.doutub.com/',
               'Cookie': ''}  # replace with your own values
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    imgs = html.xpath("//div[@class='cell']//img")

    # open() does not create missing directories, so do it once up front.
    os.makedirs('images', exist_ok=True)

    for img in imgs:
        img_url = img.get('data-src')
        img_name = img.get('alt')
        if not img_url or img_name is None:
            # Nothing to download, or no text to name the file after.
            continue
        print(img_url)

        # Strip punctuation from the caption before using it as a file name.
        img_name = re.sub(r'[\??\.。,!!]', '', img_name)
        # The caption comes from the remote page: drop path separators so it
        # cannot point outside images/ or into a directory that is missing.
        img_name = re.sub(r'[\\/]', '', img_name)
        suffix = os.path.splitext(img_url)[1]  # file extension taken from the URL
        filename = img_name + suffix

        img_response = requests.get(url=img_url, headers=headers)
        if img_response.status_code != 200:
            # Do not store an error page (e.g. a 403 body) as if it were an image.
            print('skip %s: HTTP %d' % (img_url, img_response.status_code))
            continue
        with open('images/' + filename, 'wb') as f:
            f.write(img_response.content)
def main():
    """Run the sequential crawler over the listing pages.

    The loop is written for pages 1 and 2, but the unconditional ``break``
    ends it as soon as the first page has been handed to ``parse_page``.
    """
    page_template = 'https://www.doutub.com/img_lists/wang/%d'
    for page_no in range(1, 3):
        parse_page(page_template % page_no)
        # Leaves the loop after the first page has been processed.
        break


if __name__ == '__main__':
    main()
#%%
# Approach 2: multithreaded crawler built from producer and consumer threads
class Consumer(threading.Thread):
    """Worker thread that downloads queued images into ``images/``.

    Every item taken from ``img_queue`` is an ``(img_url, filename)`` tuple.
    The thread stops once ``img_queue`` has stayed empty for ``poll_timeout``
    seconds while ``page_queue`` is empty too.  A page that is still being
    parsed at that moment is not detected, so raise ``poll_timeout`` when
    page fetches are slow.
    """

    # Headers sent with every image request.  Fetching the images with a
    # header-less urlretrieve call is what the site answered with 403.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
               'Referer': 'https://m.doutub.com/',
               'Cookie': 'Hm_lvt_0e35b200a6045f9dd8f8887cd4626da2=1695690284; Hm_lpvt_0e35b200a6045f9dd8f8887cd4626da2=1695690392'}

    # Seconds to wait for a new image before checking whether to stop.
    poll_timeout = 3

    # Seconds after which a single image request is abandoned.
    request_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``Thread``.

        Args:
            page_queue: Queue of listing-page URLs still to be parsed.
            img_queue: Queue of ``(img_url, filename)`` download jobs.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until both queues have run dry."""
        while True:
            try:
                # A timed get() cannot hang forever when another consumer
                # takes the last item between an emptiness check and the get.
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            self._save_image(img_url, filename)

    def _save_image(self, img_url, filename):
        """Fetch ``img_url`` and write its body to ``images/<filename>``."""
        try:
            response = requests.get(img_url, headers=self.HEADERS,
                                    timeout=self.request_timeout)
        except requests.RequestException as exc:
            # One unreachable image must not end the whole worker thread.
            print('skip %s: %s' % (img_url, exc))
            return
        if response.status_code != 200:
            # Do not store an error page (e.g. a 403 body) as if it were an image.
            print('skip %s: HTTP %d' % (img_url, response.status_code))
            return
        # open() does not create missing directories.
        os.makedirs('images', exist_ok=True)
        with open('images/' + filename, 'wb') as f:
            f.write(response.content)
class Producer(threading.Thread):
    """Worker thread that turns listing pages into image download jobs.

    Page URLs are taken from ``page_queue``; for every image found on a page
    an ``(img_url, filename)`` tuple is put on ``img_queue``.  The download
    itself is left to whoever reads ``img_queue``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``Thread``.

        Args:
            page_queue: Queue of listing-page URLs still to be parsed.
            img_queue: Queue receiving ``(img_url, filename)`` download jobs.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` is empty."""
        while True:
            try:
                # Non-blocking get: with several producers an empty() check
                # followed by a blocking get() can wait forever once another
                # thread has taken the last page.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Queue a download job for every image listed on ``url``.

        Args:
            url: Address of the listing page to scrape.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
                   'Referer': 'https://m.doutub.com/',
                   'Cookie': 'Hm_lvt_0e35b200a6045f9dd8f8887cd4626da2=1695690284; Hm_lpvt_0e35b200a6045f9dd8f8887cd4626da2=1695690392'}
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='expression-list clearfix']//img")
        for img in imgs:
            img_url = img.get('data-src')
            img_name = img.get('alt')
            if not img_url or img_name is None:
                # Nothing to download, or no text to name the file after.
                continue
            # Strip punctuation from the caption before using it as a file name.
            img_name = re.sub(r'[\??\.。,!!]', '', img_name)
            # The caption comes from the remote page: drop path separators so
            # it cannot point outside the download directory.
            img_name = re.sub(r'[\\/]', '', img_name)
            suffix = os.path.splitext(img_url)[1]  # file extension taken from the URL
            filename = img_name + suffix
            # Only enqueue the job here; downloading in this thread as well
            # would fetch every image twice.
            self.img_queue.put((img_url, filename))
def main():
    """Start the threaded crawler.

    Puts the URLs of listing pages 1 and 2 on a page queue, then starts five
    ``Producer`` threads followed by five ``Consumer`` threads that share the
    page queue and an image queue.
    """
    pages = Queue(2)
    images = Queue(20)
    for page_no in range(1, 3):
        pages.put('https://www.doutub.com/img_lists/wang/%d' % page_no)

    # Producers are started before consumers.
    for worker_cls in (Producer, Consumer):
        for _ in range(5):
            worker = worker_cls(pages, images)
            worker.start()


if __name__ == '__main__':
    main()