# -*- coding: utf-8 -*-
import pandas as pd
import requests
import urllib.request
import json
from bs4 import BeautifulSoup
# Root URL of the division-code listing being crawled (the path ends in /2017/).
# Every relative link collected while crawling is appended to this prefix.
baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
# HTTP headers sent with every request. The User-Agent is a desktop browser
# string -- presumably so the site serves the crawler like a regular browser.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}
def getPro(f):
    """Crawl the top-level index page and descend into every linked province.

    Fetches ``baseUrl``, decodes the body as GBK, and for each non-empty,
    linked cell inside the ``provincetr`` rows calls ``getCity`` with the
    cell text as the province name and the link's ``href`` as the relative
    URL of the province page.

    Args:
        f: Writable text file object; passed through unchanged to ``getCity``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout a single stalled connection would hang the crawl forever.
    resp = requests.get(baseUrl, headers=headers, timeout=30)
    # Bytes that are not valid GBK are dropped instead of aborting the crawl.
    html = resp.content.decode("gbk", "ignore")
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr", class_="provincetr"):
        for cell in row.find_all("td"):
            if cell.text == '':
                continue
            link = cell.find("a")
            # A cell can carry text (e.g. whitespace padding) without a link;
            # indexing the missing tag would raise TypeError, so skip it.
            if link is None or not link.get("href"):
                continue
            getCity(f, cell.text, link["href"])
def getCity(f, proName, url):
    """Crawl one province page and descend into every linked city row.

    Fetches ``baseUrl + url``, decodes it as GBK, and for each ``citytr`` row
    that contains a link calls ``getDistict`` with the text of the row's
    second cell as the city name.

    Args:
        f: Writable text file object; passed through unchanged.
        proName: Province name, forwarded to the deeper levels.
        url: Province page URL relative to ``baseUrl``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout a single stalled connection would hang the crawl forever.
    resp = requests.get(baseUrl + url, headers=headers, timeout=30)
    # Bytes that are not valid GBK are dropped instead of aborting the crawl.
    html = resp.content.decode("gbk", "ignore")
    soup = BeautifulSoup(html, "html.parser")
    # The part of ``url`` before the first "." is handed down as ``bodyUrl``;
    # getDistict prefixes it to the links it finds on the city page.
    bodyUrl = url.split(".")[0]
    for row in soup.find_all('tr', class_="citytr"):
        cityName = row.find_all("td")[1].text
        # Look the link up once; rows without a usable link are skipped.
        link = row.find("a")
        if link is None or not link.get("href"):
            continue
        getDistict(f, proName, cityName, bodyUrl, link["href"])
def getDistict(f, proName, cityName, bodyUrl, url):
    """Crawl one city page and descend into every linked district row.

    Fetches ``baseUrl + url``, decodes it as GBK, and for each ``countytr``
    row that contains a link calls ``getTown`` with the text of the row's
    second cell as the district name. Rows without a link are skipped.

    Args:
        f: Writable text file object; passed through unchanged.
        proName: Province name, forwarded to ``getTown``.
        cityName: City name, forwarded to ``getTown``.
        bodyUrl: Path prefix that is joined with "/" in front of each
            district link before it is handed to ``getTown``.
        url: City page URL relative to ``baseUrl``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout a single stalled connection would hang the crawl forever.
    resp = requests.get(baseUrl + url, headers=headers, timeout=30)
    # Bytes that are not valid GBK are dropped instead of aborting the crawl.
    html = resp.content.decode("gbk", "ignore")
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all('tr', class_="countytr"):
        disName = row.find_all("td")[1].text
        # Look the link up once; rows without a usable link are skipped.
        link = row.find("a")
        if link is None or not link.get("href"):
            continue
        getTown(f, proName, cityName, disName, bodyUrl + "/" + link["href"])
def getTown(f, proName, cityName, disName, url):
    """Crawl one district page and write one output line per town row.

    Fetches ``baseUrl + url``, decodes it as GBK, and for each ``towntr`` row
    prints and writes a line of the form
    ``province,city,district,town`` followed by a newline, where the town
    name is the text of the row's second cell.

    Args:
        f: Writable text file object that receives the output lines.
        proName: Province name (first field of each line).
        cityName: City name (second field).
        disName: District name (third field).
        url: District page URL relative to ``baseUrl``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout a single stalled connection would hang the crawl forever.
    resp = requests.get(baseUrl + url, headers=headers, timeout=30)
    # Bytes that are not valid GBK are dropped instead of aborting the crawl.
    html = resp.content.decode("gbk", "ignore")
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all('tr', class_="towntr"):
        townName = row.find_all("td")[1].text
        name = "%s,%s,%s,%s" % (proName, cityName, disName, townName)
        # Echo progress to stdout as well as recording the line in the file.
        print(name)
        f.write(name + "\n")
if __name__ == "__main__":
    # Script entry point: run the whole crawl, starting at the province
    # index, and collect every emitted line in a UTF-8 text file. The
    # context manager closes the file even if the crawl raises.
    with open("location.txt", mode='wt', encoding='UTF-8') as out_file:
        getPro(out_file)
...
...