import os
from queue import Empty, Queue
from threading import Thread

import requests
from lxml import etree
class CrawlInfo(Thread):
    """Worker thread: drains per-category URL queues and downloads each page.

    Fetched page HTML is pushed onto both ``html_queue`` (for the parse
    stage) and ``href_queue`` (for the download stage); the shared queues
    themselves are handed downstream via ``html_queue_list`` /
    ``href_queue_list`` after each per-category queue is drained.

    All five queues are shared by every worker thread.
    """

    def __init__(self, url_queue_list, html_queue, html_queue_list,
                 href_queue, href_queue_list):
        Thread.__init__(self)
        self.url_queue_list = url_queue_list    # Queue of Queues of page URLs
        self.html_queue = html_queue            # shared: raw HTML for ParseInfo
        self.html_queue_list = html_queue_list  # hand-off of html_queue
        self.href_queue = href_queue            # shared: raw HTML for GetDownLoad
        self.href_queue_list = href_queue_list  # hand-off of href_queue

    def run(self):
        headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/86.0.4240.183 Safari/537.36'
        }
        # BUG FIX: the original ``while not q.empty(): q.get()`` pattern races
        # between the 50 worker threads -- a sibling can drain the queue
        # between the emptiness check and the (blocking) get(), hanging this
        # thread forever.  get_nowait() + Empty makes the take atomic.
        while True:
            try:
                url_q = self.url_queue_list.get_nowait()
            except Empty:
                break
            while True:
                try:
                    url = url_q.get_nowait()
                except Empty:
                    break
                try:
                    # BUG FIX: a timeout keeps one dead server from hanging
                    # the worker; a failed request skips the page instead of
                    # killing the whole thread.
                    response = requests.get(url=url, headers=headers, timeout=10)
                except requests.RequestException:
                    continue
                if response.status_code == 200:
                    # BUG FIX: decode only after the status check (the
                    # original decoded every response first, including error
                    # pages).  The site serves UTF-8 bytes that requests
                    # mis-labels ISO-8859-1, so decode the raw bytes directly.
                    page_text = response.content.decode('utf-8')
                    self.html_queue.put(page_text)
                    self.href_queue.put(page_text)
            self.html_queue_list.put(self.html_queue)
            self.href_queue_list.put(self.href_queue)
class ParseInfo(Thread):
    """Worker thread: extracts pagination links from fetched category HTML.

    Takes HTML off the shared queue delivered through ``html_queue_list``,
    collects every ``@href`` under ``div.c_page`` (the pager), downloads
    those pages too, and pushes the resulting HTML onto ``href_queue`` for
    the download stage.
    """

    def __init__(self, html_queue_list, href_queue, href_queue_list):
        Thread.__init__(self)
        self.html_queue_list = html_queue_list  # hand-off queue from CrawlInfo
        self.href_queue = href_queue            # shared: raw HTML for GetDownLoad
        self.href_queue_list = href_queue_list  # hand-off of href_queue

    def run(self):
        headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        # BUG FIX: empty()/get() raced between threads, and the original
        # called the inner html_q.get() with NO emptiness guard at all --
        # once a sibling thread drained the shared html queue this call
        # blocked forever.  get_nowait() + Empty fixes both.
        while True:
            try:
                html_q = self.html_queue_list.get_nowait()
            except Empty:
                break
            try:
                page_html = html_q.get_nowait()
            except Empty:
                continue
            tree = etree.HTML(page_html)
            for rel_href in tree.xpath('//div[@class="c_page"]//@href'):
                page_url = 'http://www.quantuwang.cc' + rel_href
                try:
                    # BUG FIX: timeout + best-effort skip (see CrawlInfo).
                    resp = requests.get(url=page_url, headers=headers, timeout=10)
                except requests.RequestException:
                    continue
                if resp.status_code == 200:
                    # Decode the raw bytes as UTF-8 only after the status
                    # check; requests mis-detects the charset as ISO-8859-1.
                    self.href_queue.put(resp.content.decode('utf-8'))
            self.href_queue_list.put(self.href_queue)
class GetDownLoad(Thread):
    """Worker thread: downloads every image of every gallery.

    Pulls listing HTML off the queue delivered through ``href_queue_list``,
    follows each gallery link under ``ul.ul960c``, walks the gallery's own
    pager (``div.c_page``), and saves each ``div.c_img`` image under
    ``./全图网/<page title>/``.
    """

    def __init__(self, href_queue_list):
        Thread.__init__(self)
        self.href_queue_list = href_queue_list  # hand-off queue from earlier stages

    def run(self):
        requests.adapters.DEFAULT_RETRIES = 5
        session = requests.Session()
        session.keep_alive = False
        headers = {
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        # BUG FIX: atomic queue takes instead of the racy empty()/get()
        # pattern (see CrawlInfo).
        while True:
            try:
                href_q = self.href_queue_list.get_nowait()
            except Empty:
                break
            while True:
                try:
                    listing_html = href_q.get_nowait()
                except Empty:
                    break
                listing_tree = etree.HTML(listing_html)
                for gallery_href in listing_tree.xpath('//ul[@class="ul960c"]//@href'):
                    gallery_url = 'http://www.quantuwang.cc' + gallery_href
                    gallery_pages = [gallery_url]
                    try:
                        # BUG FIX: timeout + best-effort skip (see CrawlInfo).
                        first = session.get(url=gallery_url, headers=headers, timeout=10)
                    except requests.RequestException:
                        continue
                    first_tree = etree.HTML(first.content.decode('utf-8'))
                    # '/' would split the title into sub-directories.
                    title = first_tree.xpath('/html/head/title/text()')[0].replace("/", '')
                    title_path = './全图网' + f'/{title}'
                    # BUG FIX: os.mkdir raised FileNotFoundError whenever the
                    # parent ./全图网 directory did not exist, and the
                    # exists()/mkdir() pair raced between the 50 threads.
                    # makedirs(exist_ok=True) handles both.
                    os.makedirs(title_path, exist_ok=True)
                    for rel in first_tree.xpath('//div[@class="c_page"]//@href'):
                        gallery_pages.append('http://www.quantuwang.cc' + rel)
                    for page_url in gallery_pages:
                        try:
                            page_html = session.get(url=page_url, headers=headers, timeout=10).text
                        except requests.RequestException:
                            continue
                        src_list = etree.HTML(page_html).xpath('//div[@class="c_img"]//@src')
                        if src_list:
                            try:
                                jpg_data = session.get(url=src_list[0], headers=headers, timeout=10).content
                            except requests.RequestException:
                                continue
                            jpg_name = src_list[0].split('/')[-1]
                            jpg_path = title_path + '/' + jpg_name
                            with open(jpg_path, 'wb') as fp:
                                fp.write(jpg_data)
                            print(jpg_name, '下载完成')
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    # Fetch the category index.  The site serves UTF-8 that requests
    # mis-detects as ISO-8859-1, hence the encode/decode round-trip.
    index_resp = requests.get(url='http://www.quantuwang.cc/meinv/', headers=headers)
    index_tree = etree.HTML(index_resp.text.encode('ISO-8859-1').decode('utf-8'))

    # For each category <li>, collect its first page plus every pager link.
    all_href_list = []
    for li in index_tree.xpath('/html/body/div[10]/div[2]/div[1]/div/ul[1]/li'):
        first_page = 'http://www.quantuwang.cc' + li.xpath('./a/@href')[0]
        category_pages = [first_page]
        page_resp = requests.get(url=first_page, headers=headers)
        page_tree = etree.HTML(page_resp.text.encode('ISO-8859-1').decode('utf-8'))
        for rel in page_tree.xpath('//div[@class="c_page"]//@href'):
            category_pages.append('http://www.quantuwang.cc' + rel)
        all_href_list.append(category_pages)

    # Shared pipeline queues: one Queue of URLs per category, plus the
    # hand-off queues between the crawl, parse and download stages.
    url_queue_list = Queue()
    html_queue_list = Queue()
    href_queue_list = Queue()
    html_queue = Queue()
    href_queue = Queue()
    for category_pages in all_href_list:
        url_queue = Queue()
        for page_url in category_pages:
            url_queue.put(page_url)
        url_queue_list.put(url_queue)

    # Stage 1: crawl category pages.
    crawlers = [
        CrawlInfo(url_queue_list, html_queue, html_queue_list, href_queue, href_queue_list)
        for _ in range(50)
    ]
    for worker in crawlers:
        worker.start()
    for worker in crawlers:
        worker.join()

    # Stage 2: parse pagination out of the crawled HTML.
    parsers = [
        ParseInfo(html_queue_list, href_queue, href_queue_list)
        for _ in range(50)
    ]
    for worker in parsers:
        worker.start()
    for worker in parsers:
        worker.join()

    # Stage 3: download the images.
    downloaders = [GetDownLoad(href_queue_list) for _ in range(50)]
    for worker in downloaders:
        worker.start()
    for worker in downloaders:
        worker.join()