"""Fetch joke pages from qiushibaike.com and print every joke found on them.

Each listing page is downloaded, the ``<div class="content">`` blocks are
pulled out with a regular expression, and every match is printed together
with its page number and its zero-based position on that page.
"""
import re
import sys
import urllib.error
import urllib.request

# Browser-like User-Agent header attached to every request made via urllib.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
)

# Install a global opener so plain urllib.request.urlopen() sends the header.
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# Two groups per joke: the text inside <span>, and whatever follows the
# closing </span> up to the end of the content <div>.
pattern = r'<div class="content">.*?<span>(.*?)</span>(.*?)</div>'

# Compiled once instead of on every page; re.S lets "." span newlines,
# which is required because a joke's markup covers several lines.
JOKE_RE = re.compile(pattern, re.S)

# Pages to crawl (the original script fetched page 1 only).
PAGE_NUMBERS = range(1, 2)

# Seconds to wait on the network before giving up on a page.
REQUEST_TIMEOUT = 10


def build_page_url(page_number):
    """Return the listing URL for the given page number."""
    return "https://www.qiushibaike.com/8hr/page/" + str(page_number) + "/"


def fetch_page(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are dropped. The response is closed as soon as the
    body has been read.

    Raises:
        urllib.error.HTTPError: the server answered with an error status.
        OSError: any other network failure (``URLError``, timeouts, ...).
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8", "ignore")


def extract_jokes(page_html):
    """Return a list of ``(span_text, trailing_markup)`` tuples found in *page_html*.

    An empty list is returned when the page contains no joke blocks.
    """
    return JOKE_RE.findall(page_html)


def print_jokes(page_number, jokes):
    """Print every joke of one page, preceded by a page/position header."""
    for index, joke in enumerate(jokes):
        print(f"第{page_number}页第{index}个段子内容是:")
        print(joke)


def main():
    """Crawl every page in ``PAGE_NUMBERS`` and print the jokes found."""
    for page_number in PAGE_NUMBERS:
        url = build_page_url(page_number)
        try:
            page_html = fetch_page(url)
        except urllib.error.HTTPError as exc:
            # The server responded, but with an error status: skip this page.
            print(f"HTTP {exc.code} while fetching {url}", file=sys.stderr)
            continue
        except OSError as exc:
            # URLError and socket timeouts are OSError subclasses; one bad
            # page should not abort the remaining ones.
            print(f"Failed to fetch {url}: {exc}", file=sys.stderr)
            continue
        print_jokes(page_number, extract_jokes(page_html))


if __name__ == "__main__":
    main()
普通爬虫(糗事百科)
最新推荐文章于 2023-04-17 11:15:54 发布