import logging
import re
import threading
import time

import requests
def thread_log(name):
    """Configure root logging to append DEBUG-level records to the file *name*.

    Each record carries the timestamp, level, logger name, thread id and
    message, so output from concurrent spider/download threads can be told
    apart.  `basicConfig` is a no-op if logging was already configured.
    """
    log_format = "%(asctime)s - %(levelname)s - %(name)s - %(thread)s - %(message)s"
    logging.basicConfig(filename=name, level=logging.DEBUG, format=log_format)
from bs4 import BeautifulSoup

# Listing pages of doutula.com; page numbers 1-24 are pre-queued for the
# spider threads to consume.
url0 = 'https://www.doutula.com/photo/list/?page='
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Shared work queues: spiders pop from page_url_list and push found image-page
# links onto face_url_list; downloaders pop from face_url_list.  Both lists
# are only touched while holding glock.
page_url_list = [url0 + str(i) for i in range(1, 25)]
face_url_list = []
glock = threading.Lock()
def spider():
    """Worker: pop page URLs from page_url_list and collect image-page links.

    Fetches each listing page, extracts all links matching the doutula photo
    URL pattern, and appends them to the shared face_url_list.  Exits when
    page_url_list is empty.  Shared state is protected by glock.
    """
    photo_link = re.compile(r"https://www.doutula.com/photo(.*?)")
    while True:
        with glock:
            if not page_url_list:
                break
            page_url = page_url_list.pop()
        # Send the browser User-Agent here too, consistent with download().
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        container = soup.find('div', attrs={'class': 'page-content text-center'})
        if container is None:
            # Layout changed or the request was blocked -- skip this page
            # instead of crashing the whole worker thread.
            continue
        img_list = container.find_all(href=photo_link)
        with glock:
            for imgurl in img_list:
                u = imgurl.get('href')
                face_url_list.append(u)
                # Print only the new link, not the whole accumulated list.
                print(u)
def download():
    """Worker: pop image-page URLs from face_url_list and save each to disk.

    The page body is written verbatim to '<last-path-segment>.html' in the
    current directory.  When the queue is momentarily empty the thread sleeps
    briefly and retries (spiders may still be producing), so it runs until the
    process exits.  Shared state is protected by glock.
    """
    while True:
        with glock:
            face_url = face_url_list.pop() if face_url_list else None
        if face_url is None:
            # Queue is empty but spiders may still be filling it; sleep so we
            # don't spin a CPU core at 100% while waiting.
            time.sleep(0.1)
            continue
        html = requests.get(face_url, headers=headers).content
        # Use the last non-empty path segment as the file name; rstrip guards
        # against URLs that end with a trailing slash.
        filename = face_url.rstrip('/').split('/')[-1] + '.html'
        print(filename)
        with open(filename, 'wb') as f:
            f.write(html)
def main():
    """Configure logging, then start 5 spider and 2 download worker threads."""
    # Configure logging BEFORE starting workers, so their records are captured.
    thread_log('thread_log')
    for _ in range(5):
        threading.Thread(target=spider).start()
    for _ in range(2):
        threading.Thread(target=download).start()


if __name__ == '__main__':
    main()
# Study notes:
# 1. This task taught the difference between multiprocessing and multithreading
#    (a process contains threads); multithreading speeds up the crawler's fetching.
# 2. Learned to use the logging module, which makes later debugging easier.