1 需求
爬取中国经营报网站上与“贵州茅台”相关的新闻，并处理数据乱码。
2 代码实现
import re
import requests
# Browser-like User-Agent sent with the search request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'}
# Search endpoint of the China Business Journal site, queried for "贵州茅台".
url = 'http://www.cb.com.cn/index/search/esSearch?search_text=贵州茅台'
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Both patterns target the same result anchors; one captures the title
# attribute, the other the link target.
p_title = '<a href=".*?title="(.*?)" class="opacity_hover_v8 w_fit">'
p_href = '<a href="(.*?)" target="_blank".*?class="opacity_hover_v8 w_fit">'


def fix_encoding(text):
    """Repair text that was decoded as ISO-8859-1 but is really UTF-8 or GBK.

    The text is turned back into its raw bytes and re-decoded, trying UTF-8
    first and GBK second. If the text cannot be expressed in ISO-8859-1 (it
    already contains real non-Latin characters) or neither codec accepts the
    bytes, the input is returned unchanged.

    Args:
        text: The possibly garbled string.

    Returns:
        The repaired string, or ``text`` itself when no repair applies.
    """
    try:
        raw = text.encode('ISO-8859-1')
    except UnicodeEncodeError:
        # Not mojibake of the ISO-8859-1 kind; nothing to repair.
        return text
    for codec in ('utf-8', 'gbk'):
        try:
            return raw.decode(codec)
        except UnicodeDecodeError:
            continue
    return text


def extract_news(html):
    """Extract (title, href) pairs for the search results found in ``html``.

    Titles have embedded markup removed and surrounding whitespace stripped.
    Titles and links are matched by two separate patterns and paired in
    order; if one pattern matches fewer anchors than the other, the surplus
    entries are dropped rather than raising ``IndexError``.

    Args:
        html: The search result page as text.

    Returns:
        A list of ``(title, href)`` tuples in page order.
    """
    titles = re.findall(p_title, html)
    hrefs = re.findall(p_href, html)
    return [
        (re.sub('<.*?>', '', raw_title).strip(), link)
        for raw_title, link in zip(titles, hrefs)
    ]


def fetch_search_page():
    """Download the search result page and return it with its encoding repaired.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    return fix_encoding(response.text)


def main():
    """Print every news result as a numbered title followed by its link."""
    for number, (title, href) in enumerate(extract_news(fetch_search_page()), start=1):
        print(f"{number}.{title}")
        print(f"({href})")


if __name__ == "__main__":
    main()