# Joke-scraping example (qiushibaike.com)
# For learning/reference purposes only
import urllib.request
import ssl
import json
import re
def jokeCrawler(url):
    """Fetch the joke listing page at *url* and return {username: joke_text}.

    Downloads the page (TLS certificate verification disabled -- acceptable
    only for this learning example), then extracts each post's author name
    and joke body with regular expressions.

    NOTE: the patterns are tied to qiushibaike.com's historical markup and
    will yield an empty dict if the site layout has changed.

    :param url: page URL to fetch (any scheme urllib supports).
    :return: dict mapping author name -> joke text; later posts by the
             same author overwrite earlier ones.
    """
    headers = {
        # Pretend to be a desktop browser so the site serves the normal page.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1"
    }
    req = urllib.request.Request(url, headers=headers)
    # SECURITY: disables TLS certificate verification (private ssl API).
    # Fine for a classroom demo, never for production code.
    context = ssl._create_unverified_context()
    # `with` ensures the connection is closed (the original leaked it).
    with urllib.request.urlopen(req, context=context) as response:
        html = response.read().decode('utf-8')

    # One match per post: everything from the author header up to the
    # vote counter that follows the joke body.
    re_joke = re.compile(
        r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">',
        re.S,
    )
    # Compiled once here instead of once per loop iteration.
    re_user = re.compile(r'<h2>(.*?)</h2>', re.S)
    re_text = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)

    dic = {}
    for div in re_joke.findall(html):
        usernames = re_user.findall(div)
        jokes = re_text.findall(div)
        # Skip fragments missing either piece instead of crashing with
        # IndexError (the original indexed [0] unconditionally).
        if not usernames or not jokes:
            continue
        dic[usernames[0]] = jokes[0]
    return dic
def _main():
    """Crawl the joke listing page and print each author with their joke."""
    url = 'https://www.qiushibaike.com/text/'
    info = jokeCrawler(url)
    for k, v in info.items():
        print(k, v)


# Guard the network call so importing this module has no side effects;
# behavior when run as a script is unchanged.
if __name__ == "__main__":
    _main()