项目需要全国省市县数据,网上找了一圈发现要么过时要么收费,于是花点时间自己写了个爬虫爬了些基础数据,基本上够用了,数据是从国家统计局爬来的,目前更新到2019年,代码如下:
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import re
import time
# Browser-like User-Agent for all requests — presumably so the stats.gov.cn
# server does not reject obvious bot traffic; confirm against the site's
# behavior if requests start failing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/67.0.3396.99 Safari/537.36'}
def get_page(url):
    """Fetch *url* and return its HTML text, or '' on any request failure.

    Transient connection errors are retried up to 2 times via HTTPAdapter.
    The response encoding is re-detected from the body because these pages
    may not declare a charset in the HTTP headers — TODO confirm against
    the actual responses.

    :param url: absolute URL to fetch.
    :return: decoded page text, or the empty string on failure.
    """
    try:
        # Close the session (and its connection pool) when done — the
        # original leaked one pool per call.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=2)  # retry count
            # Mount the retrying adapter for BOTH schemes; mounting only
            # 'http://' left https URLs without retries.
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            resp = session.get(url, headers=headers, timeout=3)
            resp.raise_for_status()
            # Let requests sniff the real encoding from the body.
            resp.encoding = resp.apparent_encoding
            return resp.text
    except requests.RequestException as e:
        # Narrow catch: only network/HTTP errors are expected here.
        # Best-effort crawler — report and let the caller skip this page.
        print(e)
        return ''
# 解析省级数据,返回(省份链接,id, 名字)
def parse_province(page):
provinces = []
id_pattern = r'(.*).html' # 用于提取链接中的id信息
soup = BeautifulSoup(page, 'lxml')
province_trs = soup.find_all(class_='provincetr')
# 有些空标签,所以需要很多判断
for tr in province_trs:
if tr:
province_items = tr.find_all(name='td')