使用Python urllib爬取糗事百科段子
import re
import urllib.request
def _extract_contents(data):
    """Return the joke texts found in the HTML string *data*, newlines removed."""
    # Regular expression capturing the first <span> body that follows each
    # <div class="content">; re.S lets '.' span line breaks.
    pat_content = '<div class="content">.*?<span>(.*?)</span>'
    return [
        content.replace('\n', '')
        for content in re.compile(pat_content, re.S).findall(data)
    ]


def getcontent(url, page):
    """Download *url* and print the text of every joke found in it.

    Args:
        url: Address of the listing page to fetch.
        page: Page number supplied by the caller. It is not used by the
            function body; it is kept so existing call sites keep working.

    Returns:
        None. Each joke is written to standard output followed by blank
        lines.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Impersonate a browser by sending a Firefox User-Agent header.
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally so urllib.request.urlopen() uses it.
    urllib.request.install_opener(opener)
    # Use a context manager so the response (and its underlying connection)
    # is closed even if read() or decode() raises.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    for content in _extract_contents(data):
        print(content + '\n\n')
# Crawl listing pages in order; range(1, 2) covers page 1 only.
for i in range(1, 2):
    url = f'http://www.qiushibaike.com/8hr/page/{i}'
    getcontent(url, i)
Python爬虫 爬取糗事百科段子
最新推荐文章于 2020-11-18 12:03:49 发布