Python 3.7.4
bs4==0.0.1
pandas==1.0.1
urllib3==1.24.2
re
实现代码
# -*- coding:utf-8 -*-
import re
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
class Spider:
    '''
    Description:
        Spider program to crawl data from bilibili.com hot search rank list
    Attributes:
        url: URL of the ranking page that crawl() downloads.
        pattern: compiled regular expression capturing (href, title) from a
            title anchor tag.
        headers: HTTP headers sent with the request (a browser User-Agent).
        request: urllib Request built by the most recent crawl() call.
        response: HTTP response returned by the most recent crawl() call.
    '''

    def __init__(self):
        self.url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
        # regular matching pattern
        self.pattern = re.compile(r'<a class="title" href="(.*?)" target="_blank">(.*?)</a>')
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}

    def crawl(self):
        '''
        Description:
            crawl the page at self.url, sending self.headers with the request
        Args:
            None
        Returns:
            body of the page decoded as UTF-8 text
        Raises:
            urllib.error.URLError: if the page cannot be fetched
        '''
        self.request = urllib.request.Request(headers=self.headers, url=self.url)
        # The context manager closes the connection once the body is read,
        # so the response is not leaked.
        with urllib.request.urlopen(self.request) as response:
            self.response = response
            page = response.read().decode('utf-8')
        return page

    def extract(self):
        '''
        Description:
            extract data from the crawled page by bs4 and re library,
            return the list of data
        Args:
            None
        Returns:
            list of (href, title) tuples, one for each <div class="content">
            element that contains a title anchor
        '''
        page = self.crawl()
        soup = BeautifulSoup(page, 'html.parser')
        results = []
        for frame in soup.find_all('div', class_='content'):
            match = self.pattern.search(str(frame))
            # A content block without a title anchor is skipped; indexing
            # the first findall() result would raise IndexError here.
            if match is None:
                continue
            results.append(match.groups())
        return results
if __name__ == "__main__":
    # Script entry point: crawl the ranking page and print the extracted
    # rows as a table.
    spider = Spider()
    results = spider.extract()
    print(pd.DataFrame(results))