# Python-based Qiushibaike scraper (main file)

The main file of a crawler for the Qiushibaike text channel: it downloads each listing page with `urllib.request`, extracts the author and post fields with a regular expression, hands each record to the helper module `tools` for storage, and follows the "next page" link until there is none left.

```python
# -*- coding:utf-8 -*-
import re
import time
from urllib import request

from tools import Tools, DBManager


class QSBKSpider(object):
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/text/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'
        }
        self.html = ''

    def get_html(self):
        """Download self.url into self.html, retrying up to 5 times on failure."""
        count = 1
        while True:
            try:
                req = request.Request(url=self.url, headers=self.headers)
                response = request.urlopen(req)
                self.html = response.read().decode('utf-8')
            except Exception as e:
                count += 1
                if count > 5:
                    print('Failed to fetch the current page!')
                    break
                print('%s, fetch error, retrying (attempt %s)...' % (e, count))
            else:
                break
            time.sleep(1)

    def parse_data(self):
        """Extract avatar URL, author, gender tag, content, vote and comment counts."""
        # Note: the original pattern matched only "womenIcon"; \w*Icon also
        # matches "manIcon", so posts by male authors are not silently skipped.
        pattern = re.compile(
            r'<div class="author.*?<img src="(.*?)".*?<h2>(.*?)</h2>.*?'
            r'"articleGender \w*Icon">(.*?)</div>.*?<span>(.*?)</span>.*?'
            r'<i class="number">(.*?)</i>.*?<i.*?>(.*?)</i>',
            re.S)
        result = re.findall(pattern, self.html)
        for rs in result:
            data = list(rs)
            data[1] = Tools.strip_char(rs[1])   # clean the author name
            data[3] = Tools.strip_char(rs[3])   # clean the post content
            DBManager.insert_data(data)

        # Look for the "next page" marker, then cut out the href just before it.
        index = self.html.find('class="next"')
        if index != -1:
            # Slice the chunk of HTML right before the marker.
            s = self.html[index - 90:index]
            pattern = re.compile('href="(.*?)"')
            next_href = re.search(pattern, s)
            page = next_href.group(1).split('/')[-2]
            print('Crawling page {}'.format(page))
            self.url = 'https://www.qiushibaike.com' + next_href.group(1)
            self.get_html()
            # Recurse to parse the newly fetched page.
            self.parse_data()
        else:
            print('No next page')
            return

    def start(self):
        self.get_html()
        self.parse_data()


if __name__ == '__main__':
    # Connect to the database
    DBManager.connect_db()
    # Run the spider
    qsbk = QSBKSpider()
    qsbk.start()
    # Close the database
    DBManager.close_db()
```
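The main file imports `Tools` and `DBManager` from a separate `tools` module that is not included in this post. For readers who want to run the script end to end, here is a minimal sketch of what that module could look like. It is an assumption, not the author's original code: the SQLite backend, the `qsbk.db` file name, and the `jokes` table are all placeholders (the real helper may well use MySQL or another store).

```python
# tools.py -- hypothetical sketch; the real helper module is not shown in the post.
import re
import sqlite3


class Tools(object):
    @staticmethod
    def strip_char(text):
        """Turn <br/> into newlines, drop any other HTML tags, trim whitespace."""
        text = re.sub(r'<br\s*/?>', '\n', text)
        text = re.sub(r'<.*?>', '', text)
        return text.strip()


class DBManager(object):
    # Connection and cursor are kept as class-level state because the main
    # file calls connect_db / insert_data / close_db directly on the class.
    conn = None
    cursor = None

    @classmethod
    def connect_db(cls):
        cls.conn = sqlite3.connect('qsbk.db')
        cls.cursor = cls.conn.cursor()
        cls.cursor.execute(
            'CREATE TABLE IF NOT EXISTS jokes ('
            'avatar TEXT, author TEXT, gender TEXT, '
            'content TEXT, votes TEXT, comments TEXT)')

    @classmethod
    def insert_data(cls, data):
        # data is the 6-element list built in QSBKSpider.parse_data().
        cls.cursor.execute(
            'INSERT INTO jokes VALUES (?, ?, ?, ?, ?, ?)', tuple(data))
        cls.conn.commit()

    @classmethod
    def close_db(cls):
        cls.cursor.close()
        cls.conn.close()
```

With a module shaped like this on the import path, running `python` on the main file above fetches the text pages one by one and fills the `jokes` table; swapping SQLite for another database only requires changing `DBManager`, since the spider never touches the connection directly.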