# 前面做过的一个小项目，适合练手 (a small project done earlier — good practice)
import requests
import json
from bs4 import BeautifulSoup
def get_html():
    """Fetch the index page of seputu.com and return its HTML as text.

    Returns:
        str: the decoded response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on network failure or timeout.
    """
    url = 'http://seputu.com/'
    # A browser-like User-Agent so the site does not reject the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"}
    # timeout keeps the script from hanging forever on a dead connection;
    # raise_for_status surfaces HTTP errors instead of silently handing an
    # error page to the parser.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text
def parse_html(html_str):
    """Parse the chapter index HTML and save it as a local JSON file.

    Args:
        html_str: raw HTML of the seputu.com index page.

    Side effects:
        Writes '18-盗墓笔记.json' in the current directory, a list of
        {'title': <section title>,
         'content': [{'href': <url>, 'box_title': <chapter name>}, ...]}.
    """
    soup = BeautifulSoup(html_str, features='lxml')
    content = []
    for mulu in soup.find_all(class_="mulu"):
        h2 = mulu.find('h2')
        if h2 is None:
            # A .mulu div without a heading carries no section — skip it.
            continue
        h2_title = h2.string  # section title (book / volume name)
        chapters = []
        # BUG FIX: the original iterated soup.find_all(class_='box') — the
        # whole document — so every section collected every chapter link on
        # the page. Restrict the search to the current section's subtree.
        for box in mulu.find_all(class_='box'):
            for a in box.find_all('a'):
                chapters.append({'href': a.get('href'),
                                 'box_title': a.string})
        content.append({'title': h2_title, 'content': chapters})
    # ensure_ascii=False keeps the Chinese titles human-readable in the file
    # instead of \uXXXX escapes; json.load reads both forms identically.
    with open('18-盗墓笔记.json', 'w', encoding='utf8') as fp:
        json.dump(content, fp=fp, indent=4, ensure_ascii=False)
def mian():
    """Entry point: download the index page and save the parsed TOC as JSON.

    NOTE(review): the name is a typo for "main"; kept for backward
    compatibility and aliased below.
    """
    html_str = get_html()
    parse_html(html_str)


# Conventional alias so new callers can use the correctly spelled name.
main = mian


if __name__ == '__main__':
    mian()