# Fetch `url`, pick the response encoding from the charset declared in the
# page's <meta> tags, then parse the page and print its <title>.
#
# Substrings looked for in the <meta> tags to recognise the declared charset.
# Matching below is case-insensitive, so either spelling is sufficient.
utf_list = ['utf', 'UTF']
gbk_list = ['gbk', 'GBK']
try:
    # Without a timeout requests.get() may block indefinitely, in which case
    # the "time out" handler below could never run.
    wbdata = requests.get(url, timeout=10)  # proxies=proxies,
    # Collect every <meta ...> tag. The charset declaration itself is ASCII,
    # so it is readable even before the right encoding has been selected.
    meta = re.findall('<meta .*?>', wbdata.text, re.S | re.I)
    meta = str(meta)
    head = meta
    # Compare in lower case so 'GB2312', 'Utf-8', 'Gbk', ... are recognised.
    declared = head.lower()
    if 'gb2312' in declared:
        wbdata.encoding = 'gb2312'
        print('-------------->2312')
    elif any(name.lower() in declared for name in utf_list):
        wbdata.encoding = 'utf-8'
        print('-------------->utf')
    elif any(name.lower() in declared for name in gbk_list):
        wbdata.encoding = 'gbk'
        print('--------------->gbk')
    elif '18030' in declared:
        wbdata.encoding = 'gb18030'
    # If nothing matched, the encoding chosen by requests is left untouched.
    print('status code', wbdata.status_code)
    # Parse once, after the encoding is settled, so wbdata.text is decoded
    # with the charset the page declared.
    soup = BeautifulSoup(wbdata.text, 'lxml')
    title_tag = soup.find('title')
    # soup.find() returns None for pages without a <title>; fall back to an
    # empty title instead of failing on None.text.
    title = title_tag.text if title_tag is not None else ''
    print('title', title)
except requests.exceptions.Timeout:
    print('time out------------------------------------>')
except Exception as exc:
    # Stay best-effort (do not crash the caller), but do not mislabel other
    # failures as a timeout and do not swallow KeyboardInterrupt/SystemExit.
    print('request failed', repr(exc))
根据网页声明的编码选择对应的解码格式,避免解析时出现乱码(lxml 默认按 UTF-8 解析)
最新推荐文章于 2024-04-23 09:42:36 发布