BeautifulSoup煲汤

1、pip install beautifulsoup4

解析html文件:

import requests
from bs4 import BeautifulSoup

# Download the page source (the same text you would see via
# the browser's "view source"), then parse it with html.parser.
response = requests.get("http://python123.io/ws/demo.html")
page_text = response.text
soup = BeautifulSoup(page_text, "html.parser")
print(soup.prettify())

2、beautifulsoup库的基本元素

 

import requests
from bs4 import BeautifulSoup

# Fetch the demo page source (same as viewing the page source in a browser).
response = requests.get("http://python123.io/ws/demo.html")
page_text = response.text
soup = BeautifulSoup(page_text, "html.parser")
# print(soup.prettify())

# The page's <title> element.
print(soup.title)
# soup.a is the FIRST <a> tag in the document.
first_link = soup.a
print(first_link)
print(soup.a.name)                # tag name: 'a'
print(soup.a.parent.name)         # name of the enclosing tag
print(soup.a.parent.parent.name)  # name of the grandparent tag
print(first_link.attrs)           # all attributes of the tag, as a dict
print(first_link.attrs['class'])
print(first_link.attrs['href'])
print(type(first_link.attrs))     # plain dict
print(type(first_link))           # bs4.element.Tag
print(soup.a.string)              # text content of the first <a>
print(soup.p)
print(soup.p.string)
print(type(soup.p.string))        # bs4.element.NavigableString

 

from bs4 import BeautifulSoup

# HTML comments and ordinary text both come back through .string,
# but their types differ: Comment vs NavigableString.
markup = "<b><!--This is a comment--></b><p> This is not a comment</p>"
newsoup = BeautifulSoup(markup, "html.parser")
print(newsoup.b.string)        # the comment's text
print(type(newsoup.b.string))  # bs4.element.Comment
print(newsoup.p.string)        # regular tag text
print(type(newsoup.p.string))  # bs4.element.NavigableString

 3、基于bs4库的html内容遍历方法

标签树的下行遍历:

import requests
from bs4 import BeautifulSoup

# Downward traversal of the tag tree.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.head)
print(soup.head.contents)   # list of the <head> tag's direct children
print(soup.body.contents)
len(soup.body.contents)
print(soup.body.contents[0])
print(soup.body.contents[4])
# Iterate over direct children only.
for child in soup.body.children:
    print(child)
# Iterate over ALL descendants (children, grandchildren, ...).
# BUG FIX: the original repeated .children here despite the comment
# saying "descendants"; .descendants is the generator that walks the
# entire subtree.
for descendant in soup.body.descendants:
    print(descendant)

标签树的上行遍历:

import requests
from bs4 import BeautifulSoup

# Upward traversal: .parent / .parents walk toward the document root.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.title.parent)  # tag enclosing <title>
print(soup.html.parent)   # <html>'s parent is the whole document
print(soup.parent)        # the soup object itself has no parent -> None
# Walk every ancestor of the first <a> tag.
for ancestor in soup.a.parents:
    if ancestor is None:
        # The document root's parent is None and has no .name attribute.
        print(ancestor)
    else:
        print(ancestor.name)

标签树的平行遍历: 

 

import requests
from bs4 import BeautifulSoup

# Sideways traversal: siblings share the same parent tag.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.a.next_sibling)  # NOTE: may be a NavigableString, not a tag
print(soup.a.next_sibling.next_sibling)
print(soup.a.previous_sibling)
print(soup.a.previous_sibling.previous_sibling)
print(soup.a.parent)
# Iterate over all following / preceding siblings.
# BUG FIX: the original looped over the singular .next_sibling /
# .previous_sibling -- a single node, so iterating it walks that
# string's characters. The iterators are the plural forms.
for sibling in soup.a.next_siblings:
    print(sibling)
for sibling in soup.a.previous_siblings:
    print(sibling)

 4、基于bs4库的html格式输出

import requests
from bs4 import BeautifulSoup

# prettify() re-indents the HTML with one tag per line.
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())    # the whole document
print(soup.a.prettify())  # an individual tag can be prettified too

编码问题:

5、信息标记的三种形式

<>+tag

 

有类型的键值对

 

无类型键值对

 

 

提取HTML中所有URL链接:

步骤:1 搜索所有<a>标签。2 解析<a>标签格式,提取href后的链接内容。

import requests
from bs4 import BeautifulSoup

# Extract every URL on the page: find all <a> tags, then read each
# tag's href attribute (.get returns None when the attribute is absent).
response = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(response.text, "html.parser")
for anchor in soup.find_all('a'):
    print(anchor.get('href'))

 基于bs4库的HTML内容查找方法:

find_all:

 

import requests
import re
from bs4 import BeautifulSoup

# Demonstrations of the different argument forms find_all() accepts.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.find_all('a'))                       # all <a> tags
print(soup.find_all(['a', 'b']))                # a list matches any of the names
for element in soup.find_all(True):             # True matches every tag
    print(element.name)
for element in soup.find_all(re.compile('b')):  # regex: tag names containing 'b'
    print(element.name)
print(soup.find_all('p', 'course'))             # <p> tags with class 'course'
print(soup.find_all(id='link1'))                # exact attribute-value match
print(soup.find_all(id='link'))                 # no exact match -> empty list
print(soup.find_all(id=re.compile('link')))     # regex attribute match
print(soup.find_all('a', recursive=False))      # search direct children only
print(soup.find_all(string="Basic Python"))     # exact string-content match
print(soup.find_all(string=re.compile("python")))  # regex string match

 示例一:

中国大学排名定向爬虫

#CrawUnivRankingA.py
import requests
from bs4 import BeautifulSoup
import bs4
 
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any failure.

    Sets encoding from apparent_encoding so pages whose HTTP headers
    misreport their charset still decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` -- a bare except also swallows
        # KeyboardInterrupt/SystemExit and hides programming errors.
        return ""
 
def fillUnivList(ulist, html):
    """Parse the ranking page and append [rank, name, score] rows to ulist."""
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find('tbody').children:
        # Skip the NavigableString whitespace nodes between <tr> tags.
        if not isinstance(row, bs4.element.Tag):
            continue
        cells = row('td')  # tag(...) is shorthand for tag.find_all(...)
        ulist.append([cells[0].string, cells[1].string, cells[3].string])
 
def printUnivList(ulist, num):
    """Print up to the first *num* rows of ulist as an aligned table.

    The name column (field 1) is padded with the full-width CJK space
    chr(12288) so Chinese text lines up; the `{3}` inside the format
    spec refers to format()'s fourth argument, which supplies that
    fill character.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("排名", "学校名称", "总分", chr(12288)))
    # BUG FIX: guard against num exceeding the number of scraped rows --
    # the original raised IndexError when the fetch failed (empty list)
    # or returned fewer rows than requested.
    for i in range(min(num, len(ulist))):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))
     
def main():
    """Crawl the 2019 ranking page and print the top universities."""
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    univ_info = []
    page = getHTMLText(url)
    fillUnivList(univ_info, page)
    printUnivList(univ_info, 40)  # print the top 40 universities


main()

format方法的格式:

 中文对齐问题的解决:

采用中文字符的空格填充chr(12288)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值