先看效果:
说实话这次爬虫遇到了一点问题。当我将段子写入 txt 文件时发生如下错误:
查询资料后了解到, txt 文件的默认编码是 GBK 编码而不是 Unicode 编码,所以我们需要自己设置编码方式。解决方法:
导入模块:import codecs
打开文件时:with codecs.open('d:\\file.txt', 'w', 'utf-8') as file:
如此操作后可正常运行。参考链接:https://blog.csdn.net/yq0632/article/details/80254587
以下是代码内容:
导入模块:
import re
import codecs
from urllib import request, error
from bs4 import BeautifulSoup
获取 html 网页内容:
def askurl(url):
    """Fetch *url* and return its body decoded as a UTF-8 string.

    Returns None when the request fails; the HTTP status code and/or
    failure reason are printed rather than raised to the caller.
    """
    # Spoof a desktop-browser User-Agent: many sites (including this
    # one) reject the default urllib agent string.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    req = request.Request(url=url, headers=headers)
    try:
        # Context manager guarantees the response is closed even if
        # .read() or .decode() fails (original leaked the connection).
        with request.urlopen(req) as respond:
            return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None  # explicit: callers must handle a failed fetch
解析网页:
# Pre-compiled extraction patterns, hoisted to module level so they are
# compiled once.  re.S makes '.' match newlines, since each article's
# HTML spans multiple lines.
FindName = re.compile(r'<h2>(.*?)</h2>', re.S)  # poster's screen name
FindContent = re.compile(r'<span>(.*?)</span>', re.S)  # joke body text
def getdata(pages=10):
    """Scrape *pages* listing pages of text jokes from qiushibaike.

    pages: number of listing pages to fetch (default 10, matching the
           original hard-coded behavior).
    Returns a list of [name, content] pairs.
    """
    datalist = []
    baseurl = 'https://www.qiushibaike.com/text/page/'
    for i in range(pages):
        html = askurl(baseurl + str(i + 1))
        if html is None:
            # askurl prints and returns None on failure; skip the page
            # instead of crashing BeautifulSoup with a None argument.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', {'class': 'article block untagged mb15 typs_hot'}):
            item = str(item)
            names = FindName.findall(item)
            contents = FindContent.findall(item)
            if not names or not contents:
                # Layout changed or partial block: skip safely instead
                # of raising IndexError on an empty findall() result.
                continue
            name = names[0].replace('\n', '')
            content = contents[0].replace('\n\n\n', '')
            content = content.replace('<br/>', '\n')
            datalist.append([name, content])
    return datalist
保存数据:
def savedata(savepath, datalist=None):
    """Write scraped jokes to *savepath* as UTF-8 text.

    savepath: destination file path.
    datalist: list of [name, content] pairs; fetched via getdata()
              when not supplied, preserving the original call style.
    """
    if datalist is None:
        datalist = getdata()
    # codecs.open pins the encoding to UTF-8: a plain open() defaults
    # to GBK on Chinese-locale Windows and fails on some characters.
    with codecs.open(savepath, 'w', 'utf-8') as file:
        for idx, (name, content) in enumerate(datalist, start=1):
            file.write('用户' + str(idx) + '是 --> ')
            file.write(name)
            file.write('\n')
            file.write('段子 --> ')
            file.write(content)
            file.write('\n')
主函数运行:
def main():
    """Entry point: scrape the jokes and save them to a fixed path."""
    target = 'd:\\糗事百科段子.txt'
    savedata(target)


if __name__ == '__main__':
    main()