下面用一个简单的多线程版本来爬取,速度比单线程快很多,代码如下:
#encoding:utf-8
#多线程爬取
import requests
import os
from bs4 import BeautifulSoup
import threading
import urllib.request
# URL template for the listing pages; `{}` is replaced by the page number.
FIRST_PAGE_URL = 'http://www.qiubaichengren.com/{}.html'
# Work queue of listing-page URLs for pages 1 through 99 (the end of
# range() is exclusive). Built in ascending page order; workers take
# entries from the end with pop().
PAGE_URL_LIST = [FIRST_PAGE_URL.format(page_no) for page_no in range(1, 100)]
IMG_URL_LIST = []  # all image links
# NOTE(review): presumably names for the downloaded images; not used in
# the visible part of the file -- confirm against the rest of the script.
NAME_LIST = []
# Lock that get_page() holds while checking and popping PAGE_URL_LIST.
gLock = threading.Lock()
def get_page():
while True:
gLock.acquire()
if len(PAGE_URL_LIST) == 0:
gLock.release()
break
else:
page_url = PAGE_URL_LIST.pop()
gLock.release()
response = requests.get(page_url)
content = response.content
soup = BeautifulSoup(content, 'lxml')
src = soup.find_all('div', class_='mala-text')
imgs = soup.fi