# Section 1: using requests + lxml XPath (使用 requests)
import requests
import chardet
from lxml import etree
from bs4 import BeautifulSoup
# Fetch a page, detect its real encoding, and collect absolute article URLs
# from the news-list <div> via XPath.
res = requests.get("要爬取的网页")
# Detect the encoding from the raw bytes so res.text decodes correctly
# (sites often mislabel or omit the charset header).
res.encoding = chardet.detect(res.content)['encoding']
html = etree.HTML(res.text)
prefix = '网址的前缀'
links = html.xpath('//div[@class="mod-news-3"]/ul/li/a/@href')
# Map index -> absolute URL. href[1:] drops the first character of each
# relative link before prepending the site prefix.
# NOTE(review): this assumes every href starts with '/' — confirm against the page.
item = {i: prefix + href[1:] for i, href in enumerate(links)}
# Section 2: using BeautifulSoup (使用 BeautifulSoup)
# Demo: fetch a page and explore its structure with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

url = '要爬的网页'
r = requests.get(url)
# Encoding is hard-coded here; the page is assumed to be UTF-8.
# NOTE(review): section 1 detects the encoding with chardet instead — confirm
# which approach this page actually needs.
r.encoding = 'UTF-8'
demo = r.text
# demo is the HTML text being parsed; "html.parser" is Python's built-in
# parser (no extra dependency needed).
soup = BeautifulSoup(demo, "html.parser")
print(soup)                       # the parsed document
print(soup.prettify())            # indented, human-readable form
print(soup.title)                 # the <title> tag
print(soup.a)                     # the first <a> tag in the document
print(soup.a.name)                # its tag name: 'a'
# NOTE(review): if the page has no <a> tag, soup.a is None and the chained
# .parent accesses below raise AttributeError.
print(soup.a.parent.name)         # name of the <a> tag's parent
print(soup.a.parent.parent.name)  # name of the grandparent tag