from bs4 import BeautifulSoup
import xlwt
import requests
def ask_url(url, encoding='gb18030'):
    """Fetch *url* and return the decoded page text, or "" on request failure.

    Args:
        url: Absolute URL to fetch.
        encoding: Character set used to decode the response body. Defaults to
            gb18030 because the target site serves GB-encoded Chinese pages.

    Returns:
        The page HTML as a string, or the empty string when the request fails
        (connection error, timeout, or non-2xx HTTP status).
    """
    head = {
        # Present a desktop-browser UA so the site does not reject the crawler.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47"
    }
    try:
        r = requests.get(url, headers=head, timeout=30)
        r.raise_for_status()
        r.encoding = encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only request-level failures are
        # swallowed, so programming errors and KeyboardInterrupt still surface.
        return ""
def get_data(base_url, pages=63):
    """Crawl every article list page under *base_url* and collect the articles.

    List pages are numbered from *pages* down to 1 on the site
    (``<base_url><n>.htm``); they are visited in that descending order,
    matching the original crawl sequence.

    Args:
        base_url: URL prefix of the paginated article index.
        pages: Number of list pages to crawl (defaults to the site's 63).

    Returns:
        A list of dicts, each with 'title' and 'content' keys.
    """
    data_list = []
    for i in range(1, pages + 1):
        print('开始爬取第' + str(i) + '页数据--------------------')
        # Site numbers its list pages from `pages` down to 1.
        url = base_url + str(pages + 1 - i) + '.htm'
        html = ask_url(url)
        if html == "":
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='DlistWfc'):
            # Guard against malformed entries: a missing <h2>/<a> or an empty
            # anchor previously raised AttributeError and killed the whole crawl.
            anchor = item.h2.a if item.h2 is not None else None
            if anchor is None or anchor.string is None:
                continue
            title = anchor.string.strip()
            print(title)
            sub_url = 'https:' + anchor.attrs['href']
            sub_html = ask_url(sub_url)
            if sub_html == "":
                continue
            sub_soup = BeautifulSoup(sub_html, 'html.parser')
            body = sub_soup.find('div', class_='new_cont detail_con')
            if body is None:
                # Article page without the expected content div — skip it.
                continue
            data_list.append({'title': title, 'content': body.get_text().strip()})
    return data_list
def save_data(data_list, save_path):
    """Write the scraped records to an .xls workbook at *save_path*.

    Args:
        data_list: List of dicts with (optional) 'title' and 'content' keys;
            missing keys leave the corresponding cell blank.
        save_path: Destination path of the Excel file.

    Returns:
        "" — kept for backward compatibility with existing callers.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("急救常识数据集", cell_overwrite_ok=True)
    columns = ('title', 'content')
    total = len(data_list)
    print('共有' + str(total) + '条数据\n')
    # Header row (row 0).
    for col_idx, header in enumerate(columns):
        sheet.write(0, col_idx, header)
    # Data rows start at row 1; enumerate replaces the manual range(len(...)) loop.
    for row_idx, record in enumerate(data_list, start=1):
        print("\r存储数据进度:{:.2f}%".format(row_idx * 100 / total), end="")
        for col_idx, key in enumerate(columns):
            if key in record:
                sheet.write(row_idx, col_idx, record[key])
    book.save(save_path)
    return ""
if __name__ == "__main__":
base_url = "https://www.99.com.cn/jijiu/jjcs/1229-"
save_path = "急救常识数据集.xls"
data_list = get_data(base_url)
save_data(data_list, save_path)