#coding=utf-8
import urllib.request
from bs4 import BeautifulSoup
# Decode raw page bytes without raising UnicodeDecodeError: try the common
# Chinese charsets in turn and keep the first decode that produces no
# replacement character ('�'), i.e. no illegal characters in the scraped data.
def public_decode(data):
    result = None
    for _charset_ in ['utf-8', 'gbk', 'gb2312', 'gb18030']:
        if not data:
            break
        result = data.decode(_charset_, errors='replace')
        if result.find('�') < 0:
            print(_charset_)
            return result
    # None of the charsets decoded cleanly; return the last attempt
    # (still readable because of errors='replace').
    return result
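# Example usage (a minimal sketch, assuming read_pageHtml below returns the
# raw bytes of the page):
# raw = read_pageHtml("https://mp.weixin.qq.com/s/P5Cuk40ol-INRCkvSgsWvw")
# text = public_decode(raw)   # str decoded with a charset that matched cleanly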
# Fetch the page at `url` and return the raw response bytes.
def read_pageHtml(url):
    file = urllib.request.urlopen(url)
    data = file.read()
    file.close()
    return data
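# Optional sketch (not called above): some sites reject urllib's default
# User-Agent, so the request can carry browser headers instead, e.g.:
# req = urllib.request.Request(url, headers={
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) '
#                   'AppleWebKit/537.36 (KHTML, like Gecko) '
#                   'Chrome/78.0.3904.108 Safari/537.36'})
# data = urllib.request.urlopen(req).read()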
# Append text data to a local file. storagePath is the save path,
# data is the text to write.
def storageToLocalFiles(storagePath, data):
    fhandle = open(storagePath, "a", encoding='utf-8', errors='replace')
    fhandle.write(data)
    fhandle.close()
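# Example usage (a minimal sketch): append one line of text to pyweb.txt.
# storageToLocalFiles("pyweb.txt", "hello\n")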
# Link of the page to scrape; pass it to geturl(), which fetches the page data.
url = "https://mp.weixin.qq.com/s/P5Cuk40ol-INRCkvSgsWvw"
def geturl(url):
    data = read_pageHtml(url)
    # Print the raw response to the console.
    print(data)
    # The raw bytes could also be decoded explicitly first: data = public_decode(data)
    soup = BeautifulSoup(data, 'html.parser')
    data1 = soup.get_text()
    print(data1)
    # Collapse runs of blank lines into single newlines.
    while data1.find('\n\n') > -1:
        data1 = data1.replace('\n\n', '\n')
    # Replace zero-width spaces, non-breaking spaces, ideographic spaces and
    # the dingbat digit ❶ with newlines so they cannot garble the output.
    data1 = data1.replace('\u200b', '\n')
    data1 = data1.replace('\xa0', '\n')
    data1 = data1.replace('\u3000', '\n')
    data1 = data1.replace('\u2776', '\n')
    print(data1)
    # Save the source URL and the cleaned text to a local file.
    storagePath = "pyweb.txt"
    storageToLocalFiles(storagePath, '\n\n' + url + '\n\n')
    storageToLocalFiles(storagePath, data1)
# geturl(url)
# Read the target URLs from the list file (its name means "enter the notice
# URLs to fetch in this file first") and scrape each one.
f = open('请先在这个文件输入要获取内容的通告网址.txt')
data = f.readlines()  # Read the whole file into a list, one line per element.
f.close()
print(data)  # The list of URLs read from the file.
for url in data:
    url = url.replace('\n', '')
    if len(url) > 1:
        geturl(url)
        print(url)
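# The list file is expected to hold one article URL per line, for example
# (this sample line reuses the URL defined above; any accessible page works):
# https://mp.weixin.qq.com/s/P5Cuk40ol-INRCkvSgsWvw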