# -*- coding:utf-8 -*-
from urllib import request
import re
# Page to scrape and the request headers sent with it.
url = 'http://www.qiushibaike.com/hot/'
headers = {'User-Agent':'Mozilla/5.0(Windows NT 6.1); WOW64'}

# Compiled once at module level so the loop over matches does not
# re-create them. re.S lets '.' span newlines, because the <span> holding
# the text may sit on a different line than the opening <div>.
CONTENT_PATTERN = re.compile(
    r'<div class="content">.*?<span>(.*?)</span>', re.S
)
# Matches <br>, <br/> and <br />.
BR_PATTERN = re.compile(r'<br\s*/?>')


def extract_jokes(html):
    """Return the text of every ``<div class="content">`` span in *html*.

    Args:
        html: Decoded HTML source of the page.

    Returns:
        A list of strings, one per matched span, in document order, with
        HTML line breaks replaced by newline characters. The list is empty
        when nothing matches.
    """
    return [
        BR_PATTERN.sub('\n', fragment)
        for fragment in CONTENT_PATTERN.findall(html)
    ]


def fetch_page(page_url, request_headers, timeout=10):
    """Download *page_url*, print the response metadata, return the body.

    Args:
        page_url: Address to request.
        request_headers: Mapping of HTTP headers to send.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the script forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = request.Request(page_url, headers=request_headers)
    with request.urlopen(req, timeout=timeout) as response:
        print('Status:', response.status, response.reason)
        for key, value in response.getheaders():
            print(f'{key}: {value}')
        return response.read().decode('utf-8')


def main():
    """Fetch the configured page and print each extracted item, numbered."""
    data = fetch_page(url, headers)
    for index, item in enumerate(extract_jokes(data)):
        print(f'{index}:\n{item}\n')


# Only hit the network when executed as a script, not on import.
if __name__ == '__main__':
    main()
# python写网络爬虫:爬取糗事百科上的段子
# 最新推荐文章于 2024-09-21 17:56:48 发布