#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import threading
import time
from queue import Queue
from urllib import request

import requests
from lxml import etree
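# Pipeline: list pages -> detail pages -> image URLs -> image files on disk.
# Each stage hands work to the next through a bounded Queue; the collector and
# the downloaders run as separate threads (see the __main__ block below).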
class Producer(object):
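    """Scrapes image URLs from 3btbtt forum list pages and downloads them with worker threads."""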
    def __init__(self):
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
            # "Referer": "http://www.3btbtt.com/forum-index-fid-9-page-39.htm",
            "Upgrade-Insecure-Requests": "1",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive"
        }
        self.a = time.time()          # crawl start time
        self.coun = 4088              # running counter used to name downloaded images
        self.lock = threading.Lock()  # guards self.coun, which the two download threads share
        self.liss = Queue(40)         # URLs of the forum list pages
        self.lisss = Queue(2000)      # (detail-page URL, title) tuples collected from each list page
        self.imm = Queue(5000)        # individual image URLs found on the detail pages
        os.makedirs("image", exist_ok=True)  # make sure the download directory exists
    def page_list(self):
        """Build the list-page URLs to crawl."""
        for x in range(15, 32):
            self.liss.put("http://www.3btbtt.com/forum-index-fid-8-page-{}.htm".format(x))
        return self.liss
    def parse_page(self):
        """Request every list page and collect (detail-page URL, title) pairs."""
        lis = self.page_list()
        cookie = "bbs_sid=a0240d740c505019; bbs_lastday=1567489378; timeoffset=%2B08; cck_lasttime=1567489932130; cck_count=0; bbs_page=2; bbs_lastonlineupdate=1567491527"
        cooki = {i.split("=", 1)[0]: i.split("=", 1)[1] for i in cookie.split("; ")}
        count = 0
        while not lis.empty():
            li = lis.get()
            count += 1
            response = requests.get(li, headers=self.header, cookies=cooki, timeout=5)
            respon = etree.HTML(response.content.decode("utf8"))
            con_list = respon.xpath("//div[@id='threadlist']//table//td[1]/a[1]/@href")    # detail-page URLs on this list page
            con_title = respon.xpath("//div[@id='threadlist']//table//td[1]/a[2]/text()")  # detail-page titles on this list page
            if count == 1:
                # The first page starts with two sticky threads; skip them.
                con_list = con_list[2:]
                con_title = con_title[2:]
            for lists in zip(con_list, con_title):
                self.lisss.put(lists)
        return self.lisss, cooki
    def img_ur(self):
        """Request each detail page and queue the image URLs it contains."""
        list_url, cookie = self.parse_page()
        while not list_url.empty():
            url, title = list_url.get()
            print(url)
            try:
                response = requests.get(url, headers=self.header, cookies=cookie, timeout=5)
                con_respon = etree.HTML(response.content.decode('utf8'))
                img_urls = con_respon.xpath("//div[@class='message']/img/@src")  # every image URL on this detail page
                for imgss in img_urls:
                    self.imm.put(imgss)
            except Exception as e:
                print("Failed to fetch detail page {}: {}".format(url, e))
    def download(self):
        """Take image URLs off the queue and save them under image/."""
        while True:
            img_url = self.imm.get()
            print("Downloading {}".format(img_url))
            with self.lock:  # two download threads increment the shared counter
                self.coun += 1
                name = "image/{}.jpg".format(self.coun)
            try:
                request.urlretrieve(img_url, name)
                print("{} downloaded".format(name))
            except Exception as e:
                print("Failed to download image: {}".format(e))
            if self.imm.empty():
                break
if __name__ == "__main__":
    produc = Producer()
    h3 = threading.Thread(target=produc.img_ur)    # collector: list pages -> detail pages -> image URLs
    h3.start()
    time.sleep(10)  # give the collector a head start so the image queue is not empty
    h1 = threading.Thread(target=produc.download)  # two downloader threads share the image queue
    h1.start()
    h2 = threading.Thread(target=produc.download)
    h2.start()
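    # A minimal sketch of using the start time recorded in __init__ (self.a is
    # never read in the original script): join the collector thread and report
    # how long URL collection took while the downloaders keep running.
    h3.join()
    print("URL collection finished after {:.1f} seconds".format(time.time() - produc.a))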