爬虫中对网页分析的xpath,css,标签使用的区别
#!C:\Python37
# -*- coding:utf-8 -*-
"""Compare three ways of extracting the same page content while scraping:
XPath (lxml), CSS selectors (BeautifulSoup.select) and tag search
(BeautifulSoup.find_all). Target: a Qidian book-intro page."""
import requests
from bs4 import BeautifulSoup
from lxml import etree

url = "https://book.qidian.com/info/1016534035"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
}


def test_xpath():
    """Extract the book intro with an XPath expression and print it."""
    response = requests.get(url, headers=headers)
    try:
        html = etree.HTML(response.text)
        # text() returns the text fragments of every <p> under the intro div.
        infos = html.xpath('//div[@class="book-intro"]/p/text()')
        # Fix: join once instead of quadratic `info += item` concatenation.
        info = ''.join(infos)
        # NOTE(review): the last replace strips padding spaces; Qidian intros
        # typically pad with U+3000 (ideographic space) — confirm which
        # character the live page actually uses.
        info = info.replace('\r', '').replace('\n', '').replace(' ', '')
    finally:
        # Fix: release the connection even if parsing raises.
        response.close()
    print(info)


def test_bs4():
    """Extract the same intro with BeautifulSoup, two selector styles."""
    response = requests.get(url, headers=headers)
    try:
        soup = BeautifulSoup(response.text, 'lxml')

        # --- CSS-selector style ---
        # Method 1: bare tag names; class as ".value", id as "#value":
        #   soup.select('div.book-intro p')
        # Method 2: attribute in square brackets after the tag:
        infos = soup.select('div[class="book-intro"] p')
        for item in infos:
            print(item.text.replace(' ', ''))

        # --- Tag-search style ---
        # Method 1: keyword argument; class needs the trailing underscore
        # (class_) to avoid clashing with the Python keyword:
        #   soup.find_all('div', class_="book-intro")[0]
        # Method 2: an attrs dict, which also supports several attributes
        # at once:
        intro_div = soup.find_all('div', attrs={"class": "book-intro"})[0]
        intro_text = intro_div.find_all('p')[0].text.replace(' ', '')
        print(intro_text, type(intro_text), len(intro_text))
    finally:
        # Fix: the original leaked the connection here (only test_xpath
        # closed its response); close it in every exit path.
        response.close()


if __name__ == '__main__':
    # test_xpath()
    test_bs4()