from bs4 import BeautifulSoup
import requests
import re
'''信息组织与提取方法'''
# Information organization and extraction methods.
# The three forms of information markup: XML, JSON, YAML.
# Download the demo page that the search examples in this file operate on.
# The timeout prevents the script from blocking forever on a stalled
# connection, and raise_for_status() aborts on an HTTP error instead of
# letting an error page be parsed as if it were the demo document.
r = requests.get("http://python123.io/ws/demo.html", timeout=30)
r.raise_for_status()
# Decode the body with the encoding detected from its content rather than
# the one taken from the response headers.
r.encoding = r.apparent_encoding
demo = r.text
# Parse the downloaded page once; the demo function below queries this tree.
soup = BeautifulSoup(demo, "html.parser")


def get_information_test():
    """Print the results of a series of BeautifulSoup search calls.

    Demonstrates information extraction by searching the module-level
    ``soup`` tree with ``find_all`` / ``find`` using tag names, lists of
    names, ``True``, regular expressions, the CSS class shortcut, ``id``
    keyword filters and ``string`` filters.  Every result is written to
    standard output; nothing is returned.

    Queries that index the first match fall back to ``None`` when the page
    contains no matching tag, so the demo does not raise ``IndexError`` on
    an unexpected document.
    """
    # The tree is never modified, so the <a> query can be run once and
    # reused by the first and the last example.
    anchors = soup.find_all('a')

    # 'href' is the attribute name; get() returns None when it is absent.
    for link in anchors:
        print(link.get('href'))

    # A list of names matches any of them; run the query once and reuse it.
    links_and_bold = soup.find_all(['a', 'b'])
    for tag in links_and_bold:
        print(tag)
    first_match = links_and_bold[0] if links_and_bold else None
    print(len(links_and_bold), type(links_and_bold), first_match)

    # True matches every tag in the document.
    for tag in soup.find_all(True):
        print(tag.name)

    # A compiled pattern is matched anywhere in the tag name, so this
    # selects every tag whose name contains "b".
    for tag in soup.find_all(re.compile('b')):
        print(tag.name)

    # A string as the second positional argument is matched against the
    # CSS class: <p> tags whose class is "course".
    print(soup.find_all('p', 'course'))
    # Tags whose id attribute equals "link1".
    print(soup.find_all(id='link1'))
    # Tags whose id attribute contains "link".
    print(soup.find_all(id=re.compile('link')))
    # Text nodes that are exactly "Basic Python".
    print(soup.find_all(string="Basic Python"))
    # Text nodes containing "python" / "Python" (the match is case-sensitive).
    print(soup.find_all(string=re.compile("python")))
    print(soup.find_all(string=re.compile("Python")))
    # Calling the soup object directly is shorthand for soup.find_all().
    print(soup(True))

    # Compare the type returned by find() with that of a find_all() element.
    print("------------------")
    print(type(soup.find('a')))
    print("------------------")
    first_anchor = anchors[0] if anchors else None
    print(type(first_anchor))


get_information_test()