# -*- coding: utf-8 -*-
"""Fetch one qiushibaike.com listing page and print its text-only posts.

The script downloads a single page, extracts every post with a regular
expression, and prints the captured ``alt`` value and text of each post whose
trailing markup does not mention ``img``.  It runs on both Python 2 and
Python 3.
"""
import re

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3: Request/urlopen live in urllib.request
    import urllib.request as urllib2
    from urllib.error import URLError

page = 1
url = 'https://www.qiushibaike.com/' + str(page)
# The request is sent with a browser-like User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Capture groups:
#   1 - everything between ``alt=`` and the closing ``>`` of the <img> inside
#       the "author" block (surrounding quotes included; presumably the
#       author's name -- confirm against the live markup)
#   2 - the contents of the first <span> inside the "content" block
#   3 - the markup between the HTML comment and the "stats" block; used only
#       to detect posts that carry an image
# Raw strings keep ``\s`` / ``\S`` from being treated as string escapes.
pattern = re.compile(
    r'<div class="author.*?>[\s\S]*?<a.*?>[\s\S]*?<img.*?alt=(.*?)>[\s\S]*?</div>[\s\S]*?'
    r'<div class="content">\n<span>([\s\S]*?)</span>[\s\S]*?<!--.*?-->([\s\S]*?)<div class="stats">',
    re.S)


def build_url(page_number):
    """Return the listing URL for *page_number*."""
    return 'https://www.qiushibaike.com/' + str(page_number)


def fetch_page(page_url, request_headers):
    """Download *page_url* and return its body decoded as UTF-8.

    Args:
        page_url: Address of the page to download.
        request_headers: Mapping of HTTP headers sent with the request.

    Returns:
        The response body as text.

    Raises:
        URLError: If the request fails (``HTTPError`` is a subclass).
    """
    request = urllib2.Request(page_url, headers=request_headers)
    response = urllib2.urlopen(request)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()


def extract_jokes(content):
    """Return ``(alt_value, text)`` pairs for the image-free posts in *content*.

    Args:
        content: HTML text of a listing page.

    Returns:
        A list of 2-tuples holding capture groups 1 and 2 of ``pattern`` for
        every match whose third group does not contain ``img``.  The list is
        empty when nothing matches.
    """
    return [
        (item[0], item[1])
        for item in pattern.findall(content)
        if not re.search("img", item[2])
    ]


def main():
    """Fetch the configured page and print one line per text-only post."""
    try:
        content = fetch_page(url, headers)
    except URLError as e:
        # HTTPError carries ``code``; plain URLError only has ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return
    for alt_value, text in extract_jokes(content):
        # A single formatted argument prints identically on Python 2 and 3.
        print("%s %s" % (alt_value, text))


if __name__ == "__main__":
    main()
爬虫学习笔记1——爬取糗百段子
最新推荐文章于 2020-11-26 11:31:08 发布