# Here is the multiprocessing version.
from multiprocessing.pool import Pool
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import os
# HTTP request headers carrying a desktop Chrome User-Agent string.
# The value is assembled from adjacent string literals purely to keep
# the source lines short; at runtime it is one single string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.116 Safari/537.36'
    ),
}
# Get the level-one codes, names, and links to the next level
def getOneLevelCodeName(originUrl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'):
web = requests.get(originUrl, headers=headers) # 获取网页
web.encoding = web.apparent_encoding # 设置编码
soup = BeautifulSoup(web.text, 'html.parser') # 解析网页
provinceList = soup.select('.provincetr') # 查找类名为provincetr的内容
oneLevelWeb = []
for table in provinceList:
for province in table.select('a'):
oneLevelWeb.append((province['href'], province.te