BeautifulSoup
- Import the library
from bs4 import BeautifulSoup  # suited to simple pages
- Create the soup object
soup = BeautifulSoup(test_data, 'lxml')
- Tag attributes
# print(soup.a)  # get the first tag with this name
# print(soup.a['href'])  # get the given attribute of the tag
# print(soup.a.contents, type(soup.a.contents))  # the tag's content, as a list
# print(soup.a.text, type(soup.a.text))  # the tag's text, as a string
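A runnable version of these accessors, assuming a hypothetical stand-in for test_data (the real notes parse a saved page):

from bs4 import BeautifulSoup

# Assumed stand-in for test_data
test_data = '<div><a href="link1.html" id="places_neighbours__row">first item</a></div>'
soup = BeautifulSoup(test_data, 'lxml')

print(soup.a)           # the whole first <a> tag
print(soup.a['href'])   # link1.html
print(soup.a.contents)  # ['first item'] -- a list
print(soup.a.text)      # first item -- a string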
- Finding tags
- Find all a tags
- Find a tags with a given id, and read their href
- Find all tags with a given id, whatever the tag name

# Find all a tags
# a_list = soup.find_all('a')
# for a in a_list:
#     print(a['href'])

# Find the a tags whose id is places_neighbours__row, and read their href
# a_place = soup.find_all('a', id='places_neighbours__row')
# for a in a_place:
#     print(a['href'])

# Broader than the above: with no tag name given, every tag whose id is
# places_neighbours__row is returned
# attrs_place = soup.find_all(attrs={'id': 'places_neighbours__row'})
# for attrs in attrs_place:
#     print(attrs.name)  # .name is the tag's name
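For reference, a self-contained sketch of the three lookups above; the two-tag snippet is an assumed stand-in for the scraped page:

from bs4 import BeautifulSoup

test_data = ('<div><a href="link1.html" id="places_neighbours__row">one</a>'
             '<span id="places_neighbours__row">two</span></div>')
soup = BeautifulSoup(test_data, 'lxml')

for a in soup.find_all('a'):
    print(a['href'])  # every <a>: link1.html

for a in soup.find_all('a', id='places_neighbours__row'):
    print(a['href'])  # only <a> tags with that id

for tag in soup.find_all(attrs={'id': 'places_neighbours__row'}):
    print(tag.name)   # any tag with that id: a, span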
lxml
- / selects from the root tag; // selects from the current tag, matching anywhere below it
- * is a wildcard that selects all elements
- Example: //div/book[2]/title[@lang="zh"] selects, under the current div path, the second book's (indexing starts at 1) title element that has lang="zh"
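A quick demonstration on a toy document (parsed with lxml.etree so the made-up tags are kept verbatim; the structure is only assumed to resemble the notes' test_data):

from lxml import etree

doc = etree.fromstring(
    '<div>'
    '<book><title lang="en">First</title></book>'
    '<book><title lang="zh">Second</title></book>'
    '</div>')
print(doc.xpath('/div/book/title/text()'))                  # from the root: ['First', 'Second']
print(doc.xpath('//title/text()'))                          # from anywhere: ['First', 'Second']
print(doc.xpath('//div/*'))                                 # wildcard: the two <book> elements
print(doc.xpath('//div/book[2]/title[@lang="zh"]/text()'))  # ['Second']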
- Import the library
import lxml.html
- Create the lxml object
html = lxml.html.fromstring(test_data)
- Relative paths with //
# The three queries below return the same result; each resolves to title.
# The shorter the expression, the more of the tree has to be searched.
# html_data = html.xpath('//div/book/title')
# html_data = html.xpath('//book/title')
# html_data = html.xpath('//title')
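A quick check of the equivalence on a toy document:

from lxml import etree

doc = etree.fromstring('<div><book><title>A</title></book></div>')
# All three resolve to the same <title> nodes; the shorter ones scan more of the tree.
for path in ('//div/book/title', '//book/title', '//title'):
    print(path, [t.text for t in doc.xpath(path)])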
- What @* does
# Select every title that carries at least one attribute
# html_data = html.xpath('//book/title[@*]')
# Select all of title's attribute values; the result is the values, not the text
# html_data = html.xpath('//book/title/@*')
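A sketch of the difference between the two queries, on assumed sample data:

from lxml import etree

doc = etree.fromstring(
    '<div><book><title lang="en">A</title></book>'
    '<book><title>B</title></book></div>')
print(doc.xpath('//book/title[@*]/text()'))  # titles that carry any attribute: ['A']
print(doc.xpath('//book/title/@*'))          # the attribute values themselves: ['en']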
- The built-in text() function
Extracts the title's content as a list of strings, with no .text attribute needed
# html_data = html.xpath('//book/title/text()')
# html_data = html.xpath('//div/ul/li[1]/a/text()')
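For example, on a made-up list whose li/a structure mirrors the second query above:

from lxml import etree

doc = etree.fromstring(
    '<div><ul><li><a href="link1.html">first</a></li>'
    '<li><a href="link2.html">second</a></li></ul></div>')
print(doc.xpath('//div/ul/li[1]/a/text()'))  # ['first'] -- already a list, no .text needed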
- Logical and
# html_data = html.xpath('//a[@href="link1.html" and @id="places_neighbours__row"]/text()')
- Logical or
# html_data = html.xpath('//li[@class="item-1" or @class="item-0"]/a/text()')
- Not equal (!=)
# html_data = html.xpath('//li[@class!="item-1" and @class!="item-0"]/a/text()')
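The three operators on one assumed sample:

from lxml import etree

doc = etree.fromstring(
    '<ul>'
    '<li class="item-0"><a href="link1.html" id="places_neighbours__row">one</a></li>'
    '<li class="item-1"><a href="link2.html">two</a></li>'
    '<li class="item-2"><a href="link3.html">three</a></li>'
    '</ul>')
print(doc.xpath('//a[@href="link1.html" and @id="places_neighbours__row"]/text()'))  # ['one']
print(doc.xpath('//li[@class="item-1" or @class="item-0"]/a/text()'))                # ['one', 'two']
print(doc.xpath('//li[@class!="item-1" and @class!="item-0"]/a/text()'))             # ['three']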
- last()
Selects the last of the matched tags; last()-1 gives the second to last
# html_data = html.xpath('//div/book[last()-1]/title/text()')
- Comparison operators
# html_data = html.xpath('//div/book[price > 39]/title/text()')
# html_data = html.xpath('//div/book[price >= 39.95]/title/text()')
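last() and the numeric comparisons, shown on an assumed bookstore snippet:

from lxml import etree

doc = etree.fromstring(
    '<div>'
    '<book><title>Cheap</title><price>29.99</price></book>'
    '<book><title>Mid</title><price>39.95</price></book>'
    '<book><title>Dear</title><price>49.99</price></book>'
    '</div>')
print(doc.xpath('//div/book[last()]/title/text()'))          # ['Dear'] -- the last book
print(doc.xpath('//div/book[last()-1]/title/text()'))        # ['Mid'] -- second to last
print(doc.xpath('//div/book[price > 39]/title/text()'))      # ['Mid', 'Dear']
print(doc.xpath('//div/book[price >= 39.95]/title/text()'))  # ['Mid', 'Dear']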
- starts-with
# html_data = html.xpath('//li[starts-with(@class,"item")]/a/text()')
# html_data = html.xpath('//li[starts-with(@class,"g")]/a/text()')
- contains
# html_data = html.xpath('//li[contains(@class,"te")]/a/text()')
# html_data = html.xpath('//title[contains(@lang,"n")]/text()')
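Both string tests on one assumed sample; note that "item-0" contains the substring "te":

from lxml import etree

doc = etree.fromstring(
    '<ul>'
    '<li class="item-0"><a>one</a></li>'
    '<li class="gold"><a>two</a></li>'
    '</ul>')
print(doc.xpath('//li[starts-with(@class,"item")]/a/text()'))  # ['one']
print(doc.xpath('//li[starts-with(@class,"g")]/a/text()'))     # ['two']
print(doc.xpath('//li[contains(@class,"te")]/a/text()'))       # ['one']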
- Ancestor and descendant nodes
# html_data = html.xpath('//book/descendant::*/text()')  # text of every descendant of book
# html_data = html.xpath('//book/ancestor::*')  # select the ancestor nodes
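The two axes on an assumed book element:

from lxml import etree

doc = etree.fromstring(
    '<div><book><title lang="en">A</title><price>10</price></book></div>')
print(doc.xpath('//book/descendant::*/text()'))           # text of every descendant: ['A', '10']
print([e.tag for e in doc.xpath('//title/ancestor::*')])  # ['div', 'book'] -- outermost first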
Scraping Qiushibaike posts
Before scraping, inspect the page source to locate the posts you want to extract

# url_base = 'https://www.qiushibaike.com/8hr/page/2/'
# result = requests.get(url_base, headers=headers)
# html = lxml.html.fromstring(result.text)
# html_data = html.xpath('//div[@class="content"]/span[1]/text()')
# # print(html_data)
# for i in html_data:
#     with open('./qiushi.txt', 'ab') as f:
#         f.write(i.encode('utf-8'))
# print(result.text)
Scraping Qiushibaike images
Before scraping, inspect the page source to locate the images you want to extract

# url_base = 'https://www.qiushibaike.com/imgrank/page/2/'
# result = requests.get(url_base, headers=headers)
# html = lxml.html.fromstring(result.text)
# html_data = html.xpath('//div[@class="thumb"]/a/img/@src')  # extract the image URLs
# # print(html_data[0][2:], type(str(html_data[0][2:])))
# for p_num, i in enumerate(html_data, start=1):  # enumerate avoids index(), which misnumbers duplicate URLs
#     photo_url = 'https:' + str(i)
#     photo = requests.get(photo_url)
#     with open('./picture/' + str(p_num) + '.jpg', 'wb') as f:
#         f.write(photo.content)