Python爬虫识别中文字符和标点符号,并且保存成txt文档
import os
import re

import requests
# Download every company's privacy-policy page, keep only the Chinese text
# (plus a few punctuation marks) and save it as one txt file per company.
# `final_df` must already be defined by the surrounding code; it is indexed
# by the "隐私政策" (URL) and "公司名称" (company name) columns.
href_list = final_df["隐私政策"].values
names = final_df["公司名称"].values

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# Directory that receives one "<company name>.txt" file per downloaded page.
OUTPUT_DIR = "data/隐私政策"

# Patterns are compiled once instead of on every loop iteration.
# "." does not match newlines, so only single-line tags / brace groups are
# removed (same as the original re.sub calls).
_TAG_RE = re.compile("<.*?>")
_BRACES_RE = re.compile("{.*?}")
# Keeps CJK ideographs U+4E00..U+9FA5, the ASCII hyphen, the ASCII comma and
# the ideographic full stop. Written as a raw string so that no invalid
# string escapes are involved; the matched character set is unchanged.
# NOTE(review): the full-width comma (U+FF0C) is NOT matched and is therefore
# dropped from the output -- confirm whether that is intended.
_KEEP_RE = re.compile(r"[\u4e00-\u9fa5\-,。]")


def _repair_encoding(text):
    """Undo a wrong ISO-8859-1 decoding of UTF-8 or GBK page content.

    The text is turned back into bytes via ISO-8859-1 and re-decoded, trying
    UTF-8 first and GBK second. If the text cannot be encoded as ISO-8859-1
    (it already contains non-Latin-1 characters) or neither decoding
    succeeds, the text is returned unchanged.
    """
    for codec in ("utf-8", "gbk"):
        try:
            return text.encode("ISO-8859-1").decode(codec)
        except UnicodeError:
            # Covers both UnicodeEncodeError and UnicodeDecodeError.
            continue
    return text


def _extract_chinese(html):
    """Strip single-line tags and brace groups, then keep only wanted chars.

    Newlines need no separate removal: the final filter never matches them.
    """
    stripped = _TAG_RE.sub("", html)
    stripped = _BRACES_RE.sub("", stripped)
    return "".join(_KEEP_RE.findall(stripped))


# Make sure the target directory exists before the first file is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for name, url in zip(names, href_list):
    if not url:
        continue
    try:
        html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException:
        # Best effort: report the failing URL and move on to the next one.
        print("失败:", url)
        continue

    content = _extract_chinese(_repair_encoding(html))

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; the context manager guarantees the file is flushed and closed.
    out_path = os.path.join(OUTPUT_DIR, "{}.txt".format(name))
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write(content)