To speed up data collection I use a multiprocess crawler here rather than a multithreaded one; if you are curious why, it is worth reading up on the difference between Python's multiprocessing and multithreading (their usage is very similar, and a threaded sketch follows the code below). With solid exception handling wrapped around each page request, the crawl runs smoothly. The code is as follows:
from bs4 import BeautifulSoup
from urllib import request, error
from multiprocessing import Process
import os


class GetData(Process):
    def __init__(self, url, page):
        super(GetData, self).__init__()
        self.url = url    # category slug, e.g. '4kfengjing'
        self.page = page  # crawl listing pages 1 .. page-1

    @classmethod
    def download(cls, block_num, block_size, total_size):
        # Progress hook for urlretrieve; total_size is -1 when the
        # server sends no Content-Length, so skip the math in that case.
        if total_size <= 0:
            return
        pre = 100 * block_num * block_size / total_size
        if pre > 100:
            pre = 100
        print(f'Download progress: {pre:.1f}%')

    def run(self):
        # Create the output directory automatically
        path = './pic/' + self.url + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        # Fetch each listing page and download its images
        for i in range(1, self.page):
            try:
                print(f'Process {os.getpid()} crawling {self.url}, page {i}')
                if i == 1:
                    url = f'http://pic.xxx.com/{self.url}/index.html'
                else:
                    url = f'http://pic.xxx.com/{self.url}/index_{i}.html'
                html = request.urlopen(url)
                soup = BeautifulSoup(html, 'html.parser')
                # Use a different loop variable than the page counter i
                for img in soup.find_all('img'):
                    name = img.get('alt')
                    src = img.get('src')
                    if name and src:
                        # Replace pic.xxx.com with the real site address
                        src = 'http://pic.xxx.com' + src
                        print({'name': name, 'src': src})
                        request.urlretrieve(src, path + name + '.jpg', GetData.download)
            except error.HTTPError as e:
                # Skip pages that come back with common error codes
                if e.code in (403, 404, 502):
                    continue
            except error.URLError:
                continue
            except Exception:
                continue


if __name__ == '__main__':
    # Start every process first and join them afterwards; calling
    # join() right after start() inside the loop would block until
    # each process finished, running the crawls one at a time.
    processes = []
    for category in ['4kfengjing', '4kyingshi', '4kqiche', '4kdongwu']:
        g = GetData(category, 10)
        g.start()
        processes.append(g)
    for g in processes:
        g.join()
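As noted at the top, the multithreaded version really is almost identical in usage. A minimal sketch, assuming the same crawl logic would live in run() (GetDataThread is a hypothetical name, not part of the code above):

from threading import Thread
import os

class GetDataThread(Thread):  # hypothetical counterpart to GetData above
    def __init__(self, url, page):
        super().__init__()
        self.url = url
        self.page = page

    def run(self):
        # Every thread reports the same pid, unlike the process version
        print(f'Thread {self.name} (pid {os.getpid()}) would crawl {self.url}, pages 1-{self.page - 1}')

if __name__ == '__main__':
    threads = [GetDataThread(c, 10) for c in ['4kfengjing', '4kyingshi', '4kqiche', '4kdongwu']]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Because of the GIL, threads in one process only overlap while they are blocked on the network; separate processes also parse pages in parallel, which is why this article goes with multiprocessing.

And one way to make the per-page exception handling a bit more robust than a bare continue is to retry transient failures before giving up. A sketch only; fetch_with_retry and its parameters are assumptions, not part of the original code:

import time
from urllib import request, error

def fetch_with_retry(url, attempts=3, delay=2):  # hypothetical helper
    # Retry transient network failures with a growing pause
    for n in range(attempts):
        try:
            return request.urlopen(url, timeout=10)
        except error.URLError:
            if n == attempts - 1:
                raise  # give up after the last attempt
            time.sleep(delay * (n + 1))

run() could call fetch_with_retry(url) instead of request.urlopen(url) and keep the existing except blocks as a final safety net.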
============================================================================
Below is the directory that gets created automatically along with the data saved locally; os.getpid() is printed in the log line so you can see at a glance that the pages are being fetched by different process IDs.
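Assuming the requests succeed, the layout on disk looks roughly like this (file names come from each image's alt text):

pic/
├── 4kfengjing/
│   ├── <alt text>.jpg
│   └── ...
├── 4kyingshi/
├── 4kqiche/
└── 4kdongwu/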