1. Preface
Quarantine ends the day after tomorrow!
2. What I Learned
2.1 Regex Example: Paginated Scraping
import requests
import re
import os

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4508.400'
    }
    # Create a folder to hold all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    # Set up a generic URL template!
    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    for pageNum in range(1, 3):
        # URL for the corresponding page number
        new_url = url % pageNum
        # Use a general-purpose crawler to fetch the whole page at this URL
        page_text = requests.get(url=new_url, headers=headers).text
        # When to use POST vs. GET? Check the Request Method in your packet-capture tool
        # Use a focused crawler to extract all the images from the page; the markup looks like:
        # <div class="thumb">
        #     <a href="/article/124878565" target="_blank">
        #         <img src="//pic.qiushibaike.com/system/pictures/12487/124878565/medium/N81D0YEUYNJ0PS3G.jpg" alt="糗事#124878565" class="illustration" width="100%" height="auto">
        #     </a>
        # </div>
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        # re.S is single-line (DOTALL) mode, so . also matches newlines; re.M is multi-line mode
        img_src_list = re.findall(ex, page_text, re.S)
        # print(img_src_list)  # This printed an empty list at first; switching headers fixed it
        for src in img_src_list:
            # Assemble the complete image URL
            src = 'https:' + src
            # Fetch the image's binary data
            img_data = requests.get(url=src, headers=headers).content
            # Save the image; [-1] means the last element
            img_name = src.split('/')[-1]
            # Path where the image is stored
            img_path = './qiutuLibs/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
                print(img_name, 'downloaded successfully!!!!!')
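The re.S flag is doing real work here: each thumb block spans several lines of HTML, and without it `.` stops at a newline. A minimal standalone sketch (the snippet of HTML is made up) showing the difference:

import re

# Hypothetical multi-line snippet mimicking the page structure above
html = '<div class="thumb">\n<img src="//pic.example.com/a.jpg" alt="x">\n</div>'

ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
print(re.findall(ex, html))        # [] -- by default, . does not match '\n'
print(re.findall(ex, html, re.S))  # ['//pic.example.com/a.jpg'] -- re.S lets . cross lines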
2.2 bs4 Parsing Basics
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Load the data from a local html document into the BeautifulSoup object
    fp = open('./test.html', 'r', encoding='utf-8')
    soup = BeautifulSoup(fp, 'lxml')
    # print(soup)
    # print(soup.a)   # soup.tagName returns the first tagName tag that appears in the html page
    # print(soup.div)
    # find('tagName') is equivalent to soup.div
    # print(soup.find('div'))  # same as print(soup.div)
    # print(soup.find('div', class_='song').string)
    # print(soup.find_all('a'))
    # print(soup.select('.tang'))  # . is the class selector, here selecting elements of class tang
    # Selector syntax: tag -> tag, class -> .class, id -> #id
    print(soup.select('.tang > ul a')[0]['href'])
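The snippet above depends on a local test.html, so it won't run anywhere else as-is. A self-contained sketch of the same accessors, using a made-up HTML string shaped the way the selectors expect:

from bs4 import BeautifulSoup

# Hypothetical markup standing in for test.html
html = '<div class="tang"><ul><li><a href="http://example.com/poem">poem</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.a)                                  # first <a> tag in the document
print(soup.find('a')['href'])                  # http://example.com/poem
print(soup.select('.tang > ul a')[0]['href'])  # same result via a CSS selector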
2.3 bs4 Parsing in Practice
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Crawl the table-of-contents page
    url = 'https://www.biquge05.com/shu164963/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4508.400'
    }
    page_text = requests.get(url=url, headers=headers).text
    # Parse the chapter titles and detail-page URLs out of the table of contents
    # 1. Instantiate a BeautifulSoup object and load the page source into it
    soup = BeautifulSoup(page_text, 'lxml')
    # Parse chapter titles and detail-page URLs
    fp = open('./linyuanxing.txt', 'w', encoding='utf-8')
    # The chapter list typically sits in <div id="list"> inside .box_con;
    # adjust the selector to the site's actual markup
    dd_list = soup.select('.box_con #list dl dd')
    for dd in dd_list:
        title = dd.a.string
        detail_url = 'https://www.biquge05.com/' + dd.a['href']
        # Request the detail page
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # Parse the chapter content out of the detail page
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', id='content')
        # Got the chapter content
        content1 = div_tag.text
        # Persist it
        fp.write(title + ':' + content1 + '\n')
        print(title, 'scraped successfully!!!')
    fp.close()
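One thing these notes gloss over: a single request that times out or gets throttled mid-loop will crash the whole run. A hedged sketch of a fetch helper that could wrap the two requests.get calls above (the fetch_text name, retry count, and delay are my own choices, not from the notes):

import time
import requests

def fetch_text(url, headers, retries=3, delay=1.0):
    # Retry a GET a few times and pause between attempts, so one bad
    # response doesn't abort the whole chapter loop
    for attempt in range(retries):
        try:
            resp = requests.get(url=url, headers=headers, timeout=10)
            resp.raise_for_status()
            resp.encoding = resp.apparent_encoding  # guard against mojibake on GBK pages
            return resp.text
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    return None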
3. Closing Remarks
Good night, no more studying tonight, I'm going to play the piano!!!