# Scrape Qiushibaike (糗事百科) with the requests and re modules.
# -*- coding:utf-8 -*-
import requests
import re
class Qiushi_Spider(object):
    """Scraper for the text listing pages of qiushibaike.com.

    Each listing page is downloaded with ``requests`` and the entry bodies
    are pulled out of the HTML with regular expressions.
    """

    # Captures the first <span> body inside each <div class="content">.
    # re.S lets "." cross newlines, since the markup may span several lines.
    _CONTENT_RE = re.compile(r'<div class="content">.*?<span>(.*?)</span>', re.S)
    # Matches any markup tag remaining inside a captured entry.
    _TAG_RE = re.compile(r'<.*?>')

    def __init__(self):
        """Set up the paginated URL template and the request headers."""
        # "{}" is filled with the page number in main().
        self.url = "https://www.qiushibaike.com/text/page/{}/"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }

    def get_response(self, url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Absolute URL of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.text

    def parse_data(self, data):
        """Extract the entry texts from one page of HTML.

        Tags embedded in an entry are replaced by a single space and the
        result is stripped of surrounding whitespace. The list is printed
        (as before) and also returned so callers can use it.

        Args:
            data: HTML source of a listing page.

        Returns:
            A list of cleaned entry strings, in page order; empty when
            nothing matches.
        """
        qiushiduanzi = [
            self._TAG_RE.sub(" ", content).strip()
            for content in self._CONTENT_RE.findall(data)
        ]
        print(qiushiduanzi)
        return qiushiduanzi

    def main(self, start_page=1, end_page=13):
        """Fetch and parse every listing page in the given range.

        Args:
            start_page: First page number to scrape (inclusive).
            end_page: Last page number to scrape (inclusive). The defaults
                reproduce the original fixed range of pages 1 through 13.
        """
        for page in range(start_page, end_page + 1):
            url = self.url.format(page)
            data = self.get_response(url)
            self.parse_data(data)
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    spider = Qiushi_Spider()
    spider.main()