python 修改requests编码方式,解决乱码问题
获取根据网页响应内容推测出的编码方式(apparent_encoding),并将其赋值给响应对象的 encoding 属性
resp.encoding = resp.apparent_encoding
bs4的使用
import requests
import re
from bs4 import BeautifulSoup
# Anti-scraping countermeasure: send a desktop-browser User-Agent so the
# request looks like it comes from a regular browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/101.0.4951.7 Safari/537.36"
    ),
}
def get_data(url='http://wenming.enorth.com.cn/system/2022/05/05/052620332.shtml',
             timeout=10):
    """Download a page and return the inner HTML of its table paragraphs.

    The page is requested with the module-level ``headers``. The response
    is decoded with the encoding that requests detects from the body, then
    parsed with BeautifulSoup. The inner markup of every matching ``<p>``
    under the first ``<tbody>`` is concatenated, and the literal
    ``<strong>`` / ``</strong>`` tags are removed.

    Args:
        url: Address of the page to fetch. Defaults to the article the
            snippet was written for.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The concatenated paragraph contents as a single string, or an
        empty string when the page contains no ``<tbody>`` element.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Override the header-declared encoding with the one detected from the
    # content; this is what prevents garbled (mojibake) text.
    resp.encoding = resp.apparent_encoding

    soup = BeautifulSoup(resp.text, 'html.parser')
    table_body = soup.find('tbody')
    if table_body is None:
        # No table body on the page: nothing to extract.
        return ''

    # NOTE: the pattern requires a space after "<p", so only paragraphs
    # that carry at least one attribute are captured, and (without
    # re.DOTALL) only when the paragraph's markup sits on a single line.
    paragraph_pattern = re.compile(r'<p .*?>(.*?)</p>')
    fragments = []
    for paragraph in table_body.find_all('p'):
        fragments.extend(paragraph_pattern.findall(str(paragraph)))

    combined = ''.join(fragments)
    # Return the cleaned text (the original computed it but returned the
    # uncleaned string instead).
    return combined.replace('<strong>', '').replace('</strong>', '')
# Script entry point: run the scraper once when the file is executed directly.
if __name__ == "__main__":
    get_data()