以糗事百科网站为例
文字笑话的URL格式为 https://www.qiushibaike.com/text/page/<页码>/ ,例如第1页是 https://www.qiushibaike.com/text/page/1/ ;一共13页。
代码:
#糗事百科爬虫
import urllib.request
import re
def getcontent(url, page, out_dir="F:/pythonpic/xiubai/"):
    """Fetch one listing page and save the joke texts found in it.

    The page at ``url`` is downloaded with a desktop-browser User-Agent and
    decoded as UTF-8 (undecodable bytes are dropped). The text of the first
    ``<span>`` following each ``<div class="content">`` is extracted,
    ``<br/>`` tags are turned into newlines, and the extracted texts are
    written back to back into ``<out_dir>/<page>.txt`` encoded as UTF-8.

    Args:
        url: Address of the listing page to download.
        page: Page number; used in the progress message and as the base
            name of the output file.
        out_dir: Directory that receives the text file. It is created when
            it does not exist. Defaults to the previously hard-coded path.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        OSError: If the output directory or file cannot be created.
    """
    import os

    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Kept from the original: the opener is installed process-wide, so any
    # later urllib.request.urlopen() call sends the same User-Agent.
    urllib.request.install_opener(opener)

    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8', 'ignore')

    # re.S lets '.' span newlines, since the markup is spread over lines.
    content_pattern = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    contents = re.compile(content_pattern, re.S).findall(html)

    print('写入第' + str(page) + '页\n')

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, str(page) + ".txt")

    # An explicit UTF-8 encoding keeps the platform default codec (e.g. GBK)
    # from rejecting characters, which previously made entries vanish
    # silently behind an ``except UnicodeError: continue``.
    with open(out_path, 'w', encoding='utf-8') as filehandle:
        for content in contents:
            filehandle.write(content.replace("<br/>", "\n"))
# Walk the listing pages 1 through 13 and save each one via getcontent().
for page_number in range(1, 14):
    page_url = "https://www.qiushibaike.com/text/page/{}/".format(page_number)
    getcontent(page_url, page_number)