I've been following Teacher Cui's crawler course recently, working through the lesson that scrapes book information from Douban. In the video it runs quickly, but on my machine it was painfully slow: roughly two hours to pull down a dozen-odd rows of data, which is unbearable. The matching with re.compile seemed to be getting stuck somewhere, so I took a roundabout approach: first grab only the HTML fragment I actually want to scrape, then apply the compiled regex to filter each fragment step by step. At that point it got much faster. The code is as follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import time
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


print(time.asctime())


def get_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    # Use a CSS selector to pull out just the .info blocks under .slide-list
    data = soup.select('.slide-list .info')
    # Compile the pattern once, outside the loop, instead of recompiling
    # it for every fragment
    pattern = re.compile(
        '<div.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>'
        '.*?year">(.*?)</span>.*?</div>',
        re.S)
    for html2 in data:
        # html2 is a bs4.element.Tag, so it has to be converted to str
        # before findall can work on it
        items = re.findall(pattern, str(html2))
        for item in items:
            yield {
                'index': item[0],   # the book's URL (the href attribute)
                'book': item[1],
                'author': item[2],
                'date': item[3]
            }


def main():
    url = 'https://book.douban.com/'
    html = get_page(url)
    for data in parse_page(html):
        print(data)


if __name__ == '__main__':
    main()
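
To actually put a number on the speedup, a rough timing sketch can help. This is my own addition, not from the course: time_parse is a hypothetical helper that runs the same compiled pattern once over the whole page and once through parse_page() on the narrowed fragments, printing the elapsed time for each.

def time_parse(html):
    # Hypothetical timing harness (not in the original post): compare the
    # regex over the entire page against the CSS-narrowed approach above.
    pattern = re.compile(
        '<div.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>'
        '.*?year">(.*?)</span>.*?</div>',
        re.S)

    start = time.time()
    full_page = pattern.findall(html)  # regex applied to the full page
    print('full page: %.3fs, %d items' % (time.time() - start, len(full_page)))

    start = time.time()
    narrowed = list(parse_page(html))  # narrow with CSS selectors first
    print('narrowed:  %.3fs, %d items' % (time.time() - start, len(narrowed)))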
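
For comparison, once BeautifulSoup has narrowed things down, the regex can be dropped entirely. Below is a sketch under the assumption that each .info block keeps the structure the regex relies on, an <a> carrying href and title plus spans with the classes author and year; parse_page_bs is a name I made up:

def parse_page_bs(html):
    # Regex-free sketch (my assumption about the markup, not verified
    # against the live page): read the same four fields with tag lookups.
    soup = BeautifulSoup(html, 'lxml')
    for info in soup.select('.slide-list .info'):
        link = info.find('a')
        author = info.select_one('.author')
        year = info.select_one('.year')
        if link is None:
            continue
        yield {
            'index': link.get('href', ''),
            'book': link.get('title', ''),
            'author': author.get_text(strip=True) if author else '',
            'date': year.get_text(strip=True) if year else ''
        }

Tag lookups like this usually survive small markup changes better than a long .*? pattern, so it is worth trying if the regex ever stops matching.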