一个简单的入门级Python爬虫

一个简易爬虫,主要功能是爬取全国各地空气质量数据,输出到一个csv文件,后续加入相关数据处理得出实时空气质量排名前10和后10的城市及其空气质量(AQI)。

使用的是BeautifulSoup库,具体用法参照http://wiki.jikexueyuan.com/project/python-crawler-guide/beautiful-soup.html

import requests
from bs4 import BeautifulSoup
import csv


def get_city_aqi(city_pinyin):

    url = 'http://www.pm25.in/' + city_pinyin
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    div_list = soup.find_all('div', {'class', 'span1'})
    city_aqi = []
    for i in range(8):
        div_content = div_list[i]
        caption = div_content.find('div', {'class', 'caption'}).text.strip()
        value = div_content.find('div', {'class', 'value'}).text.strip()

        # city_aqi.append((caption, value))
        city_aqi.append(value)
    return city_aqi


def get_all_cities():
    url = 'http://www.pm25.in'
    city_list = []
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    city_div = soup.find_all('div', {'class', 'bottom'})[1]
    city_link_list = city_div.find_all('a')
    for city_link in city_link_list:
        city_name = city_link.text
        city_pinyin = city_link['href'][1:]
        city_list.append((city_name, city_pinyin))
    return city_list


def main():

    city_list = get_all_cities()
    header = ['City', 'AQI', 'PM2.5/h', 'PM10/h', 'CO/h', 'NO2/h', 'O3/h', 'O3/8h', 'SO2/h']
    with open('china_cities_aqi.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for i, city in enumerate(city_list):
            if(i + 1) % 10 == 0:
                print('已处理{}条记录(共{}条)'.format(i + 1, len(city_list)))
            city_name = city[0]
            city_pinyin = city[1]
            city_aqi = get_city_aqi(city_pinyin)
            row = [city_name] + city_aqi
            writer.writerow(row)


if __name__ == '__main__':
    main()

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值