数据在页面中显示,但读取JSON返回值为空
查看页面源码发现,数据存储在<script>中的jsonString里
用BeautifulSoup找到页面中<script>里JSON.stringify包含的数据,并用json.loads将其解析为Python数据结构,再调用json_parse方法对数据进行处理
def spider_air(url):
    """Fetch the air-quality page at *url*, pull the JSON embedded in a
    <script> ``JSON.stringify(...)`` call out of the HTML, and pass the
    parsed data to ``json_parse`` for extraction.

    The page renders its data client-side, so a plain JSON request returns
    nothing; the payload only exists inside the page's script tags.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")
    # Escape the dot: the original pattern "JSON.stringify" let '.' match
    # any character, so e.g. "JSONXstringify" would also match.
    pattern = re.compile(r"JSON\.stringify\((.*?)\);$", re.MULTILINE | re.DOTALL)
    scripts = soup.find_all('script', string=pattern)
    # NOTE(review): the payload is assumed to live in the third matching
    # <script> for this page layout — re-check the index if the markup changes.
    data_str = pattern.findall(str(scripts[2]))
    # strict=False tolerates raw control characters inside the embedded strings.
    data_json = json.loads(data_str[0], strict=False)
    json_parse(data_json)
直接选取data里各字段的值,循环调用write方法逐行写入表格中
def json_parse(data):
    """Extract the monitoring fields from every record in *data* (a list of
    dicts decoded from the page's JSON) and append each row to the CSV via
    ``write``.

    Raises KeyError if a record is missing any expected field, matching the
    original direct-indexing behavior.
    """
    # Column order must match the header row written at startup.
    fields = ('TimePoint', 'Latitude', 'Longitude', 'StationCode',
              'PositionName', 'Area', 'PrimaryPollutant', 'PM2_5', 'O3',
              'CO', 'NO2', 'Quality', 'SO2', 'PM10', 'AQI', 'O3_8h')
    # 'record' instead of the original loop name 'list', which shadowed the builtin.
    for record in data:
        row = tuple(record[name] for name in fields)
        # Append this record as one CSV row.
        write(row)
写入csv文件中,调用的是writerow方法
def write(row):
    """Append a single data row to sxair.csv (UTF-8)."""
    # newline='' stops the csv module from inserting a blank line after
    # every record on Windows.
    with open('sxair.csv', 'a', encoding="utf-8", newline='') as handle:
        csv.writer(handle).writerow(row)
main方法,在初始化时写入标题,后续可以注释掉
if __name__ == '__main__':
    # Write the header row once on the first run; comment these lines out on
    # later runs so repeated executions do not duplicate the header.
    # NOTE(review): the original opened placeholder file '...csv' here while
    # the data rows go to 'sxair.csv' in write() — routing the header through
    # write() puts header and data in the same file.
    title = ('TimePoint', 'Latitude', 'Longitude', 'StationCode',
             'PositionName', 'Area', 'PrimaryPollutant', 'PM2_5', 'O3',
             'CO', 'NO2', 'Quality', 'SO2', 'PM10', 'AQI', 'O3_8h')
    write(title)
    # '..' is a placeholder — fill in the real page URL before running.
    spider_air('..')