Python: scraping the cities under every province (2018) from the National Bureau of Statistics website

# -*- coding: utf-8 -*-
'''
Scrape the 2018 list of cities under every province from the National Bureau of Statistics website
'''
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
# BeautifulSoup is used to parse the fetched HTML
def ProvenceGet(url, header):
    try:
        response = requests.get(url, headers=header)
        print(response.status_code)        # HTTP status code of the request
        print(response.apparent_encoding)  # encoding detected for the fetched page
        response.encoding = 'GB2312'       # the page is served as GB2312, so decode it that way
        html = response.text
        #print(response.text)
    except Exception as e:
        print('network error:', e)
        return {}

    soup = BeautifulSoup(html, 'lxml')  # parse the page source
    #print(soup.prettify())
    # find()/findAll() can locate tags such as 'table', 'a', ...
    tb = soup.findAll('a')

    provence = []
    html_provence = []
    for city_html in tb:
        # lstrip(chars)/rstrip(chars) strip any of the given characters from the
        # start/end of the string (see the note right after this function)
        #print(city_html)
        #print(city_html.attrs)
        city_html_str = str(city_html)
        a = city_html_str.lstrip('<a href="').rstrip('<br/></a>')
        city_html_list = a.split('">')
        #print(city_html_list)
        # skip the ICP filing link at the bottom of the page
        if city_html_list[1] != '京ICP备05034670号':
            provence.append(city_html_list[1])
            html_provence.append(city_html_list[0])
    provence_html_dict = dict(zip(provence, html_provence))  # {province name: relative link}
    return provence_html_dict
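
# Note on the parsing used above: lstrip()/rstrip() remove any of the listed
# characters, not a literal prefix or suffix. For example:
#   '<a href="11.html">'.lstrip('<a href="')  ->  '11.html">'
#   'abcba'.strip('ab')                       ->  'c'
# The string surgery therefore relies on the link text not starting or ending
# with any of those characters.
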
def city_get(url, header=None):
    print(url)
    res = requests.get(url, headers=header)
    res.encoding = 'GB2312'  # decode the page as GB2312
    html = res.text
    soup_1 = BeautifulSoup(html, 'lxml')
    #print(soup_1.prettify())
    tb = soup_1.findAll('a')
    #print(tb)
    cities = []
    cit = []
    for city in tb:
        #print(city)
        city_str = str(city)
        a = city_str.lstrip('<a href="').rstrip('<br/></a>')
        city_list = a.split('">')
        cities.append(city_list[1])
    # each row on a province page carries two links (the division code and the name);
    # keep the odd-indexed entries (the names) and skip the generic '市辖区'/'县' rows
    for j in range(len(cities)):
        if j % 2 == 1 and cities[j] != '市辖区' and cities[j] != '县':
            cit.append(cities[j])
    return cit



if __name__ == '__main__':
    url = r'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    provence_dict = ProvenceGet(url=url, header=header)
    #print(provence_dict)
    cities = []
    url_base = r'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    for key, value in provence_dict.items():
        url = url_base + value  # value is the relative link of the province page
        #print(url)
        city = city_get(url, header=header)
        if city:
            cities.append(city)
        time.sleep(1)  # pause between requests so the server is not hammered
    print(cities)
    dat_col = []
    for i in range(len(cities)):
        for j in range(len(cities[i])):
            # skip the placeholder entry for counties administered directly by the province
            if cities[i][j] != '省直辖县级行政区划':
                dat_col.append(cities[i][j])
    x = pd.DataFrame(dat_col)
    # writing .xlsx needs an Excel engine (openpyxl/xlsxwriter); the ./data directory must already exist
    x.to_excel(r'./data/dat.xlsx', header=False, index=False)
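
A side note on the extraction itself: because lstrip()/rstrip() work on character sets, splicing the href and the link text out of str(tag) only holds up as long as the surrounding characters happen to line up. The sketch below does the same extraction through BeautifulSoup's own accessors, tag.get('href') and tag.get_text(); it assumes the same 2018 page layout, and the helper name get_links is made up for illustration.

import requests
from bs4 import BeautifulSoup

def get_links(url, header):
    # Fetch a division page and return {link text: href} for every <a> tag.
    response = requests.get(url, headers=header)
    response.encoding = 'GB2312'
    soup = BeautifulSoup(response.text, 'lxml')
    links = {}
    for a in soup.findAll('a'):
        href = a.get('href')           # the href attribute, or None if absent
        name = a.get_text(strip=True)  # visible text of the link
        if href:
            links[name] = href
    return links

Usage would mirror ProvenceGet(): the unwanted entries (the ICP filing link, '市辖区', '县') can still be filtered out by name exactly as the original code does.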



