首先导入 requests 和 BeautifulSoup。其中 requests 是采用开源协议的 HTTP 库,能满足 HTTP 的测试需求,其常用方法包括:
requests.get():发起 HTTP GET 请求,输入参数为 url 以及 headers,返回值是网页的 response 对象。
BeautifulSoup:通过解析文档为用户提供抓取数据
通过网页源码找出需要数据文件层次结构,遍历所有同层结构获取数据,利用 find 方法取值,减少脏读
安装:pip install requests
pip install beautifulsoup4(注意:包名 BeautifulSoup 对应的是过时的第 3 版,无法通过 from bs4 import BeautifulSoup 导入;必须安装 beautifulsoup4)
爬取http://www.xiaohuar.com所有图片并写入文件夹
导入
# Two separate import statements — the original fused them onto one line,
# which is a SyntaxError in Python.
import requests
from bs4 import BeautifulSoup
1:for循环
# Sequential crawler: walk listing pages 0-44 of xiaohuar.com and save every
# image found in the <div id="list_img"> grid into ./img/.
import os

os.makedirs('./img', exist_ok=True)  # first run would otherwise crash on open()

for j in range(0, 45):
    # %s is filled with the page index j
    response = requests.get(url='http://www.xiaohuar.com/list-1-%s.html' % j)
    soup = BeautifulSoup(response.text, 'html.parser')
    # the image grid lives in <div id="list_img"> on this site's list pages
    div_ = soup.find(name='div', attrs={'id': 'list_img'})
    div_list = div_.find_all(name='div', attrs={'class': 'img'})
    for i in div_list:
        name = i.find(name='span').text
        a_url = i.find(name='a').get('href')
        img_url = i.find(name='img').get('src')
        if img_url[0] != 'h':  # site mixes absolute and relative src paths
            # reuse the already-fetched src instead of re-querying the tag
            img_url = 'http://www.xiaohuar.com' + img_url
        # last URL segment is the file name
        img_path = img_url.rsplit('/', 1)[-1]
        img_response = requests.get(img_url)
        # original also printed img_response.content — dumping raw image
        # bytes to stdout; dropped as a defect
        print(name, a_url, img_url)
        # context manager guarantees the handle is closed even if write fails
        with open('./img/%s' % img_path, 'wb') as f:
            f.write(img_response.content)
2.多线程
# Threaded variant: each worker thread downloads all images from one listing
# page. The original never imported ThreadPoolExecutor, so it raised
# NameError at runtime — fixed here.
from concurrent.futures import ThreadPoolExecutor
import os
import time


def work(i):
    """Fetch listing page *i* and save every <img> it contains under ./img/.

    File names follow the original scheme: alt text + last URL segment.
    """
    response = requests.get(url='http://www.xiaohuar.com/list-1-%s.html' % i)
    soup = BeautifulSoup(response.text, 'html.parser')
    div_ = soup.find(name='div', attrs={'class': 'infinite_scroll'})
    # NOTE: original reused `i` as the loop variable, shadowing the page
    # index parameter; renamed to `tag` to avoid the shadow
    for tag in div_.find_all(name='img'):
        img_url = tag.get('src')
        if img_url[0] != 'h':  # relative path -> prepend the host
            img_url = 'http://www.xiaohuar.com' + tag.get('src')
        print(img_url, response.url)
        img_path = tag.get('alt') + img_url.rsplit('/', 1)[-1]
        img_response = requests.get(url=img_url)
        # context manager closes the file even if the write raises
        with open('./img/%s' % img_path, 'wb') as f:
            f.write(img_response.content)


def main():
    """Download pages 0 and 1 concurrently on a two-thread pool."""
    os.makedirs('./img', exist_ok=True)  # avoid FileNotFoundError on first write
    # `with` waits for all submitted tasks, replacing the manual shutdown()
    with ThreadPoolExecutor(2) as pool:
        for page in range(0, 2):
            pool.submit(work, page)


if __name__ == '__main__':
    start = time.time()
    main()
    print('共耗时:', time.time() - start)