单线程切换到多线程版本的思路:
多线程之间的数据是共享的,而 Queue 是线程安全的,适合作为线程间传递数据的通道。
多线程版本代码如下:
config.py
# coding=utf-8
import re

# Sites scraped for free proxy addresses.
PROXY_SITES = [
    'http://cn-proxy.com',
    'http://www.xicidaili.com',
    'http://www.kuaidaili.com/free',
    'http://www.proxylists.net/?HTTP',
    # www.youdaili.net page urls change as new daily lists are published
    'http://www.youdaili.net/Daili/http/4565.html',
    'http://www.youdaili.net/Daili/http/4562.html',
    'http://www.kuaidaili.com',
    'http://proxy.mimvp.com',
]

# Referer values rotated so scraping requests look organic.
REFERER_LIST = [
    'http://www.google.com/',
    'http://www.bing.com/',
    'http://www.baidu.com/',
]

# Matches "ip:port".  A raw string avoids the invalid-escape warning for
# \. and \d, and ports can be up to 5 digits (max 65535), so the original
# {2,4} quantifier missed proxies on 5-digit ports -- widened to {2,5}.
PROXY_REGEX = re.compile(r'[0-9]+(?:\.[0-9]+){3}:\d{2,5}')

DB_HOST = 'localhost'        # MongoDB host
DB_PORT = 27017              # MongoDB default port
DATABASE_NAME = 'chapter13'
TIMEOUT = 5                  # HTTP request timeout in seconds
utils.py
# coding=utf-8
import random
import requests
from fake_useragent import UserAgent
from config import REFERER_LIST, TIMEOUT
def get_referer():
    """Return a Referer header value chosen at random from REFERER_LIST."""
    chosen = random.choice(REFERER_LIST)
    return chosen
def get_user_agent():
    """Return a random browser User-Agent string from fake_useragent."""
    return UserAgent().random
def fetch(url, proxy=None):
    """GET *url* with a browser-like User-Agent, optionally through a proxy.

    :param url: address to fetch.
    :param proxy: optional ``"host:port"`` HTTP proxy; ``None`` goes direct.
    :return: the ``requests.Response`` object.
    """
    # Fixed desktop-Chrome UA.  The original first set the header with
    # get_user_agent() and then immediately overwrote it with this constant,
    # wasting a fake_useragent lookup per request -- the redundant call is
    # dropped; the effective header is unchanged.
    ua = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
    proxies = {'http': proxy} if proxy is not None else None
    # Context manager guarantees the session's connection pool is released.
    with requests.Session() as s:
        s.headers.update({'user-agent': ua})
        return s.get(url, timeout=TIMEOUT, proxies=proxies)
proxy_fetcher_with_queue.py queue版本的多线程获取代理ip,保存到内存和数据库中。
# coding=utf-8
import re
import queue
import threading
import requests
from mongoengine import NotUniqueError
from models import Proxy
from config import PROXY_REGEX, PROXY_SITES
from utils import fetch
def save_proxies(url):
    """Scrape *url* for ``ip:port`` proxies and persist each to MongoDB.

    Duplicate addresses (``NotUniqueError``) are skipped silently.

    :param url: proxy-list page to scrape.
    :return: list of newly saved addresses; empty list when the page could
        not be fetched or yielded nothing new.
    """
    proxies = []
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        # Originally returned False; an empty list keeps the return type
        # consistent and is equally falsy for the callers that truth-test it.
        return []
    for address in re.findall(PROXY_REGEX, r.text):
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass  # address already stored -- ignore the duplicate
        else:
            proxies.append(address)  # cache addresses that saved successfully
            print("save: {}".format(address))
    return proxies
def cleanup():
    """Drop the whole proxies collection so each run starts from scratch."""
    Proxy.drop_collection()
def save_proxies_with_queue2(in_queue, out_queue):
    """Worker loop: take urls from in_queue, push scraped batches to out_queue."""
    while True:
        task_url = in_queue.get()
        scraped = save_proxies(task_url)
        # Hand the scraped batch to the collector threads via the second queue.
        out_queue.put(scraped)
        # Mark the url done so in_queue.join() can unblock once all are processed.
        in_queue.task_done()
def append_result(out_queue, result):
    """Collector loop: drain out_queue, extending *result* with truthy batches."""
    while True:
        batch = out_queue.get()
        if batch:
            result.extend(batch)
        # Signal completion so out_queue.join() can unblock.
        out_queue.task_done()
def use_thread_with_queue2():
    """Scrape PROXY_SITES with 5 worker threads and collect with 5 more.

    Workers consume urls from ``in_queue`` and push scraped batches onto
    ``out_queue``; collectors drain ``out_queue`` into ``result``.  Both
    ``join()`` calls block until every queued item has been task_done()-ed,
    then the total count of saved proxies is printed.
    """
    cleanup()
    in_queue = queue.Queue()   # thread-safe hand-off of urls to workers
    out_queue = queue.Queue()  # thread-safe hand-off of scraped batches
    for _ in range(5):
        # daemon=True replaces the deprecated Thread.setDaemon() call;
        # daemon threads exit with the main thread instead of hanging it.
        worker = threading.Thread(target=save_proxies_with_queue2,
                                  args=(in_queue, out_queue), daemon=True)
        worker.start()
    for url in PROXY_SITES:
        in_queue.put(url)
    result = []
    for _ in range(5):
        collector = threading.Thread(target=append_result,
                                     args=(out_queue, result), daemon=True)
        collector.start()
    in_queue.join()   # wait until every url has been processed
    out_queue.join()  # wait until every batch has been collected
    print(len(result))
def save_proxies_with_queue(queue):
    """Worker loop: fetch-and-save proxies for each url taken from *queue*.

    NOTE(review): the parameter name shadows the stdlib ``queue`` module
    inside this function; harmless here since the module itself is never
    referenced in the body, but worth renaming if the API ever changes.
    """
    while True:
        pending_url = queue.get()
        save_proxies(pending_url)
        # Signal completion so queue.join() knows this item is finished.
        queue.task_done()
def use_thread_with_queue():
    """Single-queue variant: 5 daemon workers save proxies from PROXY_SITES.

    Blocks on ``q.join()`` until every url has been processed; results are
    persisted to MongoDB only (nothing is collected in memory).
    """
    cleanup()
    q = queue.Queue()
    for _ in range(5):
        # daemon=True replaces the deprecated Thread.setDaemon() call.
        t = threading.Thread(target=save_proxies_with_queue, args=(q,),
                             daemon=True)
        t.start()
    for url in PROXY_SITES:
        q.put(url)
    q.join()  # wait until every url has been task_done()-ed
if __name__ == '__main__':
    # Script entry point: run the two-queue version (workers + collectors).
    use_thread_with_queue2()