一些分析
完整代码
douban_comment_spider.py
import requests
def _review_page_url(page):
    """Return the review-list URL whose ``start`` query value is *page*."""
    return f"https://movie.douban.com/subject/5989818/reviews?start={page}"


# One URL per listing offset: 0, 20, 40, ..., 140 (the stop value 151 is
# exclusive and the step is 20, so eight URLs are produced).
urls = [_review_page_url(offset) for offset in range(0, 150 + 1, 20)]
def crawl(url, timeout=10):
    """Fetch *url* with an HTTP GET and print the URL and the body length.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Defaults to 10.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (for example when the timeout expires).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }
    # requests applies no timeout by default, so a stalled connection would
    # block this call (and any thread running it) indefinitely.
    r = requests.get(url, headers=headers, timeout=timeout)
    print(url, len(r.text))  # confirm that some content was actually fetched
multi_thread_crawl.py
import douban_comment_spider as DCSpider
import threading
import time
def single_thread():
    """Crawl every URL in ``DCSpider.urls`` sequentially, in list order."""
    fetch = DCSpider.crawl
    for target in DCSpider.urls:
        fetch(target)
def multi_thread():
    """Crawl every URL in ``DCSpider.urls`` using one thread per URL.

    Every thread is started before any of them is joined, and a progress
    message is printed before and after each ``start()`` and ``join()``.
    """
    # ``args`` must be a tuple: the trailing comma makes ``(link,)`` a
    # one-element tuple instead of a parenthesised string.
    workers = [
        threading.Thread(target=DCSpider.crawl, args=(link,))
        for link in DCSpider.urls
    ]

    for idx, worker in enumerate(workers):
        print("thread", idx, "starts")
        worker.start()  # begin running crawl in the new thread
        print("thread", idx, "starts over")

    for idx, worker in enumerate(workers):
        print("thread", idx, "joins")
        worker.join()  # block until this thread has finished
        print("thread", idx, "joins over")
if __name__ == "__main__":
    # Time both strategies over the same URL list. perf_counter() is a
    # monotonic, high-resolution clock intended for measuring intervals;
    # time.time() follows the wall clock and can jump if the system clock
    # is adjusted mid-run.
    t1 = time.perf_counter()
    single_thread()
    t2 = time.perf_counter()
    print("single_thread costs", t2 - t1, "seconds")

    t1 = time.perf_counter()
    multi_thread()
    t2 = time.perf_counter()
    print("multi_thread costs", t2 - t1, "seconds")