import re
import threading
import time
from queue import Queue
import requests
from lxml import etree
class Wuso(object):
    """Multi-threaded crawler for the wuso.me webcam forum.

    Producer/consumer pipeline built on two queues:
      * ``url_queue``  — list-page URLs, filled by ``get_base_url_list`` and
        drained by 20 ``parse_url`` worker threads;
      * ``data_queue`` — parsed items, drained by 20 ``save_info`` writer
        threads that append rows to ``wuso.csv``.
    """

    def __init__(self):
        self.base_url = "https://wuso.me/forum-webcam-{}.html"
        self.s = requests.Session()
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                          '/68.0.3440.84 Safari/537.36',
        }
        # Local proxy (e.g. shadowsocks) — assumes a listener on :1080.
        self.proxies = {"https": "https://127.0.0.1:1080"}
        self.page_num = 1          # running count of saved rows (display only)
        self.url_queue = Queue()   # list-page URLs awaiting fetch
        self.data_queue = Queue()  # parsed items awaiting save

    def get_base_url_list(self):
        """Enqueue every forum list-page URL (pages 1..264)."""
        for i in range(1, 265):
            # 入队
            self.url_queue.put(self.base_url.format(i))

    def parse_url(self):
        """Worker loop: fetch one list page, parse its items, enqueue them.

        Runs forever; intended to be started as a daemon thread.
        """
        while True:
            url = self.url_queue.get()
            try:
                res = self.s.get(url, headers=self.headers,
                                 proxies=self.proxies).content.decode()
            except Exception:
                # FIX: the original recursed into self.parse_url() here and
                # then fell through to etree.HTML(res) with `res` unbound
                # (NameError), and never balanced task_done() for this item
                # so url_queue.join() could hang. Requeue and continue.
                print("再次尝试列表页:", url)
                self.url_queue.put(url)
                self.url_queue.task_done()
                continue
            html = etree.HTML(res)
            detail_pages = html.xpath('//*[@id="waterfall"]/li')
            print(len(detail_pages))
            for page in detail_pages:
                try:
                    item = {
                        'title': page.xpath('.//h3/a/text()')[0],
                        'likes_num': page.xpath('.//cite/text()')[0].split()[1],
                        'detail_url': page.xpath('./div/h3/a/@href')[0],
                    }
                except IndexError:
                    # Malformed listing entry — skip it rather than kill
                    # the whole worker thread on an unguarded [0].
                    continue
                item['video_url'] = self.get_video_url(item['detail_url'])
                self.data_queue.put(item)
            self.url_queue.task_done()

    def get_video_url(self, url):
        """Fetch a detail page and extract its video URL.

        Returns the matched URL, or the literal string "None" when the
        page contains no recognizable player block. Retries the request
        until it succeeds.
        """
        while True:
            try:
                res = self.s.get(url, headers=self.headers,
                                 proxies=self.proxies).content.decode()
            except Exception:
                # FIX: the original recursed without returning the result,
                # so every retried URL yielded None. Loop instead.
                print("再次尝试:", url)
                continue
            try:
                video_url = re.findall(r'{file: "(.*?)", primary', res)[0]
            except IndexError:
                print("没有解析到:", url)
                video_url = "None"
            print('video_url:', video_url)
            return video_url

    def save_info(self):
        """Writer loop: append each queued item to wuso.csv as one row.

        Runs forever; intended to be started as a daemon thread.
        """
        while True:
            data = self.data_queue.get()
            try:
                # FIX: explicit utf-8 — titles are Chinese; the platform
                # default encoding could corrupt or reject them.
                with open("wuso.csv", "a+", encoding="utf-8") as f:
                    f.write(
                        data['title']
                        + "," + data['likes_num']
                        + "," + data['detail_url']
                        + "," + data['video_url']
                        + "\n"
                    )
            except Exception as e:
                print("保存异常:", e)
            print("第{}条".format(self.page_num))
            # NOTE(review): incremented from 20 writer threads without a
            # lock — display counter only, so races are tolerated.
            self.page_num += 1
            self.data_queue.task_done()

    def login(self):
        """Log in through the forum's quick-login endpoint.

        Credentials are placeholders ("账号"/"密码") — fill in before use.
        The session cookie is kept on ``self.s`` for later requests.
        """
        login_url = "https://wuso.me/member.php?mod=logging&action=" \
                    "login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1"
        form_data = {
            "username": "账号",
            "password": "密码",
            "quickforward": "yes",
            "handlekey": "ls",
        }
        response = self.s.post(url=login_url, data=form_data,
                               headers=self.headers,
                               proxies=self.proxies).content.decode()
        print('登录:', response)

    def run(self):
        """Log in, start all worker threads, and wait for both queues."""
        self.login()
        thread_list = []
        # 多线程: 1 producer + 20 parsers + 20 savers
        thread_list.append(threading.Thread(target=self.get_base_url_list))
        for _ in range(20):
            thread_list.append(threading.Thread(target=self.parse_url))
        for _ in range(20):
            thread_list.append(threading.Thread(target=self.save_info))
        for t in thread_list:
            # FIX: Thread.setDaemon() is deprecated; set the attribute.
            t.daemon = True  # daemon threads die with the main thread
            t.start()
        # Block until every enqueued URL and item has been task_done()'d.
        for q in [self.url_queue, self.data_queue]:
            q.join()
if __name__ == '__main__':
    # Time the full crawl end to end.
    started_at = time.time()
    crawler = Wuso()
    crawler.run()
    elapsed = time.time() - started_at
    print("over,用时:", elapsed)
# wuso crawler — multithreading + Queue
# (article metadata: latest revision published 2024-04-06 11:26:20)