import requests
# Demo: override the encoding that requests uses to decode a response body.
url = 'https://www.csdn.net/'  # per the original note, this page is served as utf-8
# A timeout keeps the script from blocking forever if the server never answers.
response = requests.get(url, timeout=10)
# Assigning to .encoding replaces the detected value; .text is decoded with it.
response.encoding = 'gbk'
print(response.encoding)  # prints the value that was just assigned
# Output of the snippet above:
# gbk
# Same demo again: fetch `url` and force the response encoding to gbk.
# A timeout keeps the script from blocking forever if the server never answers.
response = requests.get(url, timeout=10)
# Assigning to .encoding replaces the detected value; .text is decoded with it.
response.encoding = 'gbk'
print(response.encoding)  # prints the value that was just assigned
# Example: fetch a page and extract link text with an lxml XPath query
import requests
from lxml import etree
if __name__ == '__main__':
    # Fetch the page at `url`, decode it as UTF-8, and print the text of every
    # link matched by the XPath expression below.
    url = 'https://www.shicimingju.com/book/hongloumeng.html'
    print(url)

    # Send a desktop-Chrome User-Agent string with the request.
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    # Download the page source. The timeout prevents an indefinite hang when
    # the server does not respond.
    response = requests.get(url=url, headers=header, timeout=10)

    # Set the response encoding by hand; per the original note, the text comes
    # out garbled without this line.
    response.encoding = 'utf-8'
    page_text = response.text

    # Parse the HTML into an element tree that can be queried with XPath.
    tree = etree.HTML(page_text)

    # XPath selecting the text of each <a> under div.book-mulu > ul > li.
    # (An earlier attempt used
    # '//div[@class="card bookmark-list"]/div[4]/ul/li/a/text()'.)
    list1 = tree.xpath('//div[@class="book-mulu"]/ul/li/a/text()')

    # Print one matched link text per line.
    for title in list1:
        print(title)