1.HTML解析器有很多种,比如:
| 解析工具 | 解析速度 | 难度 |
| --- | --- | --- |
| BeautifulSoup | 最慢 | 最简单 |
| lxml | 快 | 简单 |
| 正则 | 最快 | 最难 |
2.Beautiful Soup解析工具的官方文档链接:https://www.crummy.com/software/BeautifulSoup/bs4/doc/ 。
2.1获取所有"a"标签、2.2获取第2个"a"标签、2.3获取class='ulink'的"a"标签、2.4获取满足多个条件的"a"标签、2.5获取所有"a"标签的href属性、2.6获取纯文本text信息。示例代码如下:
# coding:utf-8
import requests
from lxml import etree
from bs4 import BeautifulSoup
import chardet
# Root URL of the target site (dytt/ygdy8 movie portal); relative links
# found on list pages are joined against this to build absolute URLs.
BASE_DOMAIN = "http://www.ygdy8.net"
# Browser-like request headers so the site serves normal pages to the
# scraper instead of blocking the default requests User-Agent.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
}
def get_detailed_urls(url):
    """Fetch one movie-list page and extract the plain text of its
    ``<td colspan="2">`` cells (the first cell is a header and is skipped).

    :param url: absolute URL of a list page on www.ygdy8.net
    :return: list of extracted text strings; each is also printed.
    """
    # 1. Download the page.
    response = requests.get(url, headers=HEADERS)
    # The site declares charset "gb2312"; decode with the wider "gbk"
    # superset and ignore undecodable bytes.  chardet could auto-detect
    # instead:  encoding = chardet.detect(response.content)["encoding"]
    text = response.content.decode("gbk", "ignore")

    # 2. Parse the HTML into a BeautifulSoup element tree.
    soup = BeautifulSoup(text, "lxml")

    # BeautifulSoup selection cheat-sheet (from the tutorial text):
    # 2.1 all "a" tags:        soup.find_all("a")            -> list of Tag
    # 2.2 second "a" tag:      soup.find_all("a", limit=2)[1]
    # 2.3 class='ulink':       soup.find_all("a", class_="ulink")
    #                    or    soup.find_all("a", attrs={"class": "ulink"})
    # 2.4 several conditions:  soup.find_all("a", class_="ulink",
    #                                        href="/html/.../56940.html")
    # 2.5 href attribute:      a["href"]  or  a.attrs["href"]
    # 2.6 plain text:          tag.string / tag.strings /
    #                          tag.stripped_strings / tag.get_text()

    # 2.6 (method 4) — collect the text of every matching cell.
    texts = []
    for cell in soup.find_all("td", attrs={"colspan": "2"})[1:]:
        info = cell.get_text()
        texts.append(info)
        print(info)
    return texts
def spider():
    """Walk the paginated dyzz list (pages 1-7) and process each page.

    NOTE: the ``break`` below deliberately stops after the first page,
    so only ``list_23_1.html`` is actually fetched for now.
    """
    page_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for page_no in range(1, 8):
        get_detailed_urls(page_url.format(page_no))
        break  # debugging: only the first page for now
# Script entry point: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    spider()