爬取背景:福建省发布了选考要求数据,想要获取这些数据进行分析,无奈数据量太大,无法手动逐页整理,因此编写爬虫自动抓取。
需求分析:要爬取数据的网站为 http://fj.101.com/gaokao/#/ ,需要将爬取到的数据存储为 CSV 格式。
爬取代码如下:
# coding=gbk
import requests # 引入爬虫所需的requests模块
from bs4 import BeautifulSoup # 引入BS模块
import json
import csv
for index_num in range(876):
index_num = str(index_num + 1)
base_url = 'https://wjt-subject-tool-api.sdp.101.com/v1/actions/manage?_=1567736178037&page={}&page_size=30&school_name=&subject_name='
target_url = base_url.format(index_num) # 拼接完整的目标URL
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}
response = req