这次的文章是对上一篇文章的扩展
在上一篇文章中我使用了程序自动分配多线程的方法,不过之前有一个问题:它最多只能爬取16页的数据。因为之前是把一个大列表划分成若干个小列表,每个小列表固定4个元素,而我们只开了四个线程,因此最多只能处理16页。今天我把之前的程序又改了一下,这次的功能是程序自动把一个大列表分为四个小列表,小列表里面的元素个数并不固定,而是根据大列表元素数量的多少来决定的。
下面是将大列表分为四个小列表的代码>>>>
# Demo: split the list of page numbers into at most four chunks, one per thread.
number = int(input('请输入要爬取的页数:'))
if number<=4:# few enough pages: a single thread handles all of them
    a=[i for i in range(1,number+1)]
    a1_min=min(a)
    a1_max=max(a)
    print(a)# `a` is the full list of page numbers
else:
    # BUG FIX: the original used range(1, number), which silently dropped
    # the last requested page.
    a = [i for i in range(1, number + 1)]
    # Split `a` into exactly four contiguous chunks whose sizes differ by at
    # most one (the first `r` chunks get one extra page). The original
    # ceil-step slicing could yield fewer than four sublists (e.g. 7 pages
    # -> step 2 -> 3 chunks), making b[3] below raise IndexError.
    q, r = divmod(len(a), 4)
    b = []
    start = 0
    for k in range(4):
        size = q + (1 if k < r else 0)
        b.append(a[start:start + size])
        start += size
    print(b)
    a1_max = max(b[0])
    a1_min=min(b[0])
    a2_max = max(b[1])
    a2_min = min(b[1])
    a3_max = max(b[2])
    a3_min = min(b[2])
    a4_max = max(b[3])
    a4_min = min(b[3])
step=5#我们这里就先假设step=5
我们假设len(a)=20
b = [a[i:i + step] for i in range(0, len(a), step)]
for i in range(0,len(a),step):
print(i)
得到的结果为:0,5,10,15
a[i:i+step]#第一次循环就等于a[0:0+5],之后依次为a[5:10]、a[10:15]、a[15:20]
下面是实际的应用,还是上一篇文章中的实例,爬取必应壁纸
import requests,random,re,os
from urllib import request
from lxml import etree
from threading import Thread
# Pool of browser User-Agent strings; one is picked at random so requests
# look less uniform to the server.
user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
]
headers = {"User-Agent": random.choice(user_agent)}

# BUG FIX: the original `proxies` dict literal repeated the 'http' key ten
# times; in a Python dict display later duplicate keys overwrite earlier ones,
# so only the last proxy was ever used. Keep the candidates in a list and
# pick one at random; `proxies` stays a scheme-keyed dict as requests expects.
proxy_pool = [
    'http://163.204.243.251:9999',
    'http://60.13.42.115:9999',
    'http://182.35.86.234:9999',
    'http://60.13.42.147:9999',
    'http://60.13.42.22:9999',
    'http://182.35.82.192:9999',
    'http://60.13.42.124:9999',
    'http://111.230.99.192:8118',
    'http://163.204.244.220:9999',
    'http://163.204.247.47:9999',
]
proxies = {'http': random.choice(proxy_pool)}
class biying_img(Thread):
    """Crawler thread that downloads Bing wallpaper images for a page range.

    Each instance fetches listing pages star_page..end_page from
    bing.ioliu.cn, extracts the per-image detail URLs, then downloads every
    image into a desktop folder named `flie`.
    """
    # Base URL of the wallpaper listing; the page number is appended.
    url='https://bing.ioliu.cn/?p='
    def __init__(self, star_page, end_page,flie):
        super(biying_img, self).__init__()
        # NOTE(review): `star_age` looks like a typo for `star_page`, but
        # run() reads the same attribute name, so behavior is consistent.
        self.star_age = star_page
        self.end_page = end_page  # last page to crawl (inclusive)
        self.flie=flie  # target folder name ("flie" = "file" typo, kept for compatibility)
        self.list=[]  # accumulated detail-page URLs (grows across calls)
        self.title=[]  # accumulated image titles
        self.img=[]  # accumulated direct image URLs
    def get_one_page_url(self,page):
        '''Collect the image detail-page URLs linked from one listing page.

        Appends to self.list and returns it, so the returned list
        accumulates across calls — run() therefore re-processes earlier
        pages' entries on later iterations.
        '''
        one_url=self.url+str(page)
        r=requests.get(one_url,headers=headers,proxies=proxies,timeout=1)
        url_list = re.findall(r'class="mark" href="(.*?)">', r.text, re.S)
        for i in url_list:
            # hrefs in the markup are relative; prefix the site root
            url_headers = 'https://bing.ioliu.cn'
            img_url = url_headers + i
            self.list.append(img_url)
        return self.list
    def get_one_data(self,img_url):
        '''Extract the image title and direct image URL from a detail page.

        Returns zip(self.title, self.img); both lists accumulate across
        calls, so earlier images are yielded again on later calls.
        '''
        d = requests.get(img_url, headers=headers,proxies=proxies,timeout=1)
        title = re.findall(r'<title>(.*?)</title>', d.text, re.S)
        img = re.findall('img class.*?src="(.*?)" data-progressive', d.text, re.S)
        for ti,ig in zip(title,img):
            # keep only the part of the title before the first '('
            names=ti.split('(')[0]
            ig_url=ig
            self.title.append(names)
            self.img.append(ig_url)
        return zip(self.title,self.img)
    def mkdirs(self,name):
        '''Ensure the desktop save folder for `name` exists; return its path.'''
        flie='//Users/qq/desktop/{}/'.format(name)
        if os.path.exists(flie):# folder already present: just report it
            print('文件夹已存在')
        else:# folder missing: create it (original comment had this inverted)
            print('新建文件夹')
            os.mkdir(flie)
        return flie# return the folder path
    def down_img(self,data,name):
        '''Download every (title, url) pair in `data` into folder `name`.'''
        flie=self.mkdirs(name)
        for title,img in data:
            print('正在下载:',title)
            print(img)
            try:
                down_img=requests.get(img,headers=headers,proxies=proxies,timeout=1).content
                with open('{}{}.jpg'.format(flie,title),'wb')as f:
                    f.write(down_img)
            except Exception as e:
                # best-effort download: network/timeout errors are skipped silently
                pass
    def run(self):
        '''Thread entry point: crawl pages star_page..end_page and download all images.'''
        for page in range(self.star_age,self.end_page+1):
            img_list=self.get_one_page_url(page)
            for img in img_list:
                data=self.get_one_data(img)
                self.down_img(data,self.flie)
def main():
    """Demo entry point: launch two crawler threads, for pages 1-3 and 7-10."""
    for first_page, last_page in ((1, 3), (7, 10)):
        worker = biying_img(first_page, last_page, '壁纸')
        worker.start()
def main():
    """Second demo entry point: fixed chunk size of 4 pages per thread.

    Splits pages 0..number-1 into groups of four and hands each of the first
    four groups to a crawler thread. Requires number >= 16 so that `b` holds
    at least four sublists; superseded by the final main() in this file.
    """
    number = int(input('请输入要爬取的页数:'))
    a = [i for i in range(0, number)]
    step = 4  # fixed chunk size: every thread gets (up to) 4 pages
    b = [a[i:i + step] for i in range(0, len(a), step)]
    a1_max = max(b[0])
    a1_min = min(b[0])
    a2_max = max(b[1])
    a2_min = min(b[1])
    a3_max = max(b[2])
    a3_min = min(b[2])
    a4_max = max(b[3])
    # BUG FIX: the original read min(b[4]) here, which raises IndexError
    # whenever `b` holds exactly four sublists; the fourth chunk is b[3].
    a4_min = min(b[3])
    down = biying_img(a1_min, a1_max, 'img')
    down.start()
    down1 = biying_img(a2_min, a2_max, 'img')
    down1.start()
    down2 = biying_img(a3_min, a3_max, 'img')
    down2.start()
    down3 = biying_img(a4_min, a4_max, 'img')
    down3.start()
def main():
    """Final entry point: crawl pages 1..number with up to four threads.

    number <= 4 -> a single thread handles every page.
    number > 4  -> pages are split into exactly four contiguous chunks whose
                   sizes differ by at most one, one crawler thread per chunk.
    """
    number = int(input('请输入要爬取的页数:'))
    if number < 1:
        # Guard: min()/max() below would raise ValueError on an empty list.
        print('页数必须大于0')
        return
    if number <= 4:
        # BUG FIX: the original used range(1, number), which dropped the
        # last page and crashed with min([]) when number == 1.
        a = [i for i in range(1, number + 1)]
        a1_min = min(a)
        a1_max = max(a)
        down = biying_img(a1_min, a1_max, 'img')
        down.start()
    else:
        a = [i for i in range(1, number + 1)]
        # Split into exactly four contiguous chunks; the first `r` chunks
        # get one extra page. The original ceil-step slicing could produce
        # fewer than four sublists (e.g. number == 7 -> 3 chunks), so b[3]
        # raised IndexError.
        q, r = divmod(len(a), 4)
        b = []
        start = 0
        for k in range(4):
            size = q + (1 if k < r else 0)
            b.append(a[start:start + size])
            start += size
        # One thread per chunk; each thread crawls min(chunk)..max(chunk).
        for chunk in b:
            worker = biying_img(min(chunk), max(chunk), 'img')
            worker.start()
if __name__ == '__main__':
    main()