"""Scrape thread titles and detail links from the 8btc (Babbitt) forum.

Requirement: crawl the titles and content of the first 5 pages of the
8btc forum. This script collects, for every thread found on the list
pages, its title text and the ``href`` of its detail page, and writes
the result to a JSON file.
"""

import json
import logging
import time

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class BtcSpider(object):
    """Collect thread titles and detail URLs from the forum list pages."""

    def __init__(self, timeout=10):
        """Set up the URL template, request headers and result buffer.

        Args:
            timeout: Seconds to wait for the server before a request is
                abandoned. Without a timeout ``requests`` waits forever.
        """
        # Page number is substituted into the placeholder.
        self.url = "https://bbs.8btc.com/forum-61-{}.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36"
        }
        self.timeout = timeout
        # Accumulates one dict per thread found on the list pages.
        self.data_list = []

    # 1. Send the request
    def get_response(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The page body as ``str``.

        Raises:
            requests.RequestException: On connection failure, timeout, or
                an HTTP error status (4xx/5xx).
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.timeout
        )
        # Fail loudly on error pages instead of parsing them as listings.
        response.raise_for_status()
        return response.content.decode()

    # 2. Parse the list-page data
    def parse_list_data(self, data):
        """Extract thread titles and links from one list page.

        Every element whose ``class`` attribute equals
        ``'link-dark-major font-bold bbt-block'`` contributes one entry
        ``{"title": ..., "detail_url": ...}`` appended to
        ``self.data_list``.

        Args:
            data: HTML source of a list page.
        """
        soup = BeautifulSoup(data, 'lxml')
        title_list = soup.find_all(
            attrs={'class': 'link-dark-major font-bold bbt-block'}
        )
        self.data_list.extend(
            {"title": title.get_text(), "detail_url": title.get("href")}
            for title in title_list
        )

    # 3. Save the data
    def save_data(self, data, file_path):
        """Serialize ``data`` as JSON into ``file_path`` (UTF-8).

        Args:
            data: JSON-serializable object to store.
            file_path: Destination path; an existing file is overwritten.
        """
        with open(file_path, 'w', encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII titles human-readable
            # instead of \uXXXX escapes; the file is UTF-8 anyway.
            json.dump(data, f, ensure_ascii=False)

    # 4. Run
    def start(self, first_page=1, last_page=5, delay=5.2,
              file_path="04-btc_list.json"):
        """Crawl the list pages and save everything that was collected.

        A page that cannot be downloaded is logged and skipped so the
        results from the remaining pages are still saved.

        Args:
            first_page: Number of the first list page to fetch.
            last_page: Number of the last list page to fetch (inclusive).
            delay: Seconds to pause between consecutive requests.
            file_path: Where the collected entries are written as JSON.
        """
        for page in range(first_page, last_page + 1):
            # Pause only between requests, not after the final one.
            if page != first_page:
                time.sleep(delay)
            url = self.url.format(page)
            try:
                data = self.get_response(url)
            except requests.RequestException as exc:
                logger.warning("Skipping %s: %s", url, exc)
                continue
            self.parse_list_data(data)
        self.save_data(self.data_list, file_path)


if __name__ == "__main__":
    BtcSpider().start()