BeautifulSoup常用的四种对象(Type)
BeautifulSoup将复杂的HTML文档转换成一个复杂的树型结构,每个节点都是Python对象,所有对象可以归纳为4种:
Tag、NavigableString、BeautifulSoup、Comment。Tag通俗来讲就是HTML中的一个个标签。
BeautifulSoup中的select()方法
# coding:utf-8
import requests
from lxml import etree
from bs4 import BeautifulSoup
import chardet
# Scheme and host of the site being scraped.
BASE_DOMAIN = "http://www.ygdy8.net"

# Request headers passed to requests.get(); the User-Agent is a desktop
# Chrome string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.79 Safari/537.36"
    ),
}
def get_detailed_urls(url):
    """Download a page and print the visible text of each of its table rows.

    The page is fetched with the module-level ``HEADERS``, decoded as GBK,
    parsed with BeautifulSoup's ``lxml`` parser, and every ``<tr>`` element
    matched by ``soup.select("tr")`` is printed as a list of its stripped
    text fragments.

    Args:
        url: Address of the page to download.

    Returns:
        None. Output is written to stdout, one list per ``<tr>``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not respond within the timeout.
    """
    # 1. Fetch the page. The timeout stops a stalled server from blocking
    #    the crawl indefinitely (requests has no timeout by default).
    response = requests.get(url, headers=HEADERS, timeout=10)

    # The page declares a gb2312 charset; it is decoded as GBK and any
    # undecodable bytes are dropped rather than raising.
    text = response.content.decode("gbk", "ignore")

    # 2. Parse the markup into a BeautifulSoup tree.
    soup = BeautifulSoup(text, "lxml")

    # Other select() queries that can be tried on the same soup:
    #   soup.select("tr")            -> every <tr> element
    #   soup.select("tr")[1]         -> the second <tr> element
    #   soup.select("table.tbspan")  -> every <table> with class "tbspan"
    #   soup.select("a")             -> every <a>; read tag["href"] for links

    # 3. Print the text fragments of every table row.
    for row in soup.select("tr"):
        row_texts = list(row.stripped_strings)
        # Parenthesised form prints the same under Python 2 and Python 3.
        print(row_texts)
def spider(max_pages=1):
    """Fetch listing pages 1..max_pages and print their table-row text.

    Each page URL is built from ``BASE_DOMAIN`` and the
    ``list_23_<page>.html`` pattern, then handed to ``get_detailed_urls``.

    Args:
        max_pages: Number of listing pages to process, starting at page 1.
            Defaults to 1, which matches the previous behaviour of stopping
            after the first page. Values below 1 process nothing.
    """
    page_url_template = BASE_DOMAIN + "/html/gndy/dyzz/list_23_{}.html"
    for page_number in range(1, max_pages + 1):
        get_detailed_urls(page_url_template.format(page_number))
# Start the crawl only when this file is executed directly as a script.
if __name__ == "__main__":
    spider()