python爬取统计数据_python爬取国家统计局2019年行政区划分数据mssql

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 功能: 获取省市县数据
# 版本: v1.1

import importlib
import os
import sys

import lxml.etree as etree
import pymssql
import requests

# NOTE(review): presumably a leftover of the Python 2
# "reload(sys); sys.setdefaultencoding(...)" idiom. Kept so the module-level
# behaviour stays exactly as in the original script.
importlib.reload(sys)


class chinese_city():
    """Crawl the 2019 administrative-division pages and store them in MSSQL.

    Each level of the hierarchy is published as an HTML table whose rows carry
    a level-specific CSS class.  ``parse`` extracts the rows that link to a
    child page, ``parseVillager`` extracts the leaf rows, and
    ``insert_to_db`` writes the resulting dicts into the ``fetch_area`` table.
    """

    def __init__(self):
        """Set up the XPath expression selecting the table rows of each level."""
        # Key is the level number passed around as ``trid``.
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]',
        }

    def crawl_page(self, url, timeout=1000):
        """Download *url* and return its body decoded as GBK.

        Prints the HTTP status code followed by the URL.

        Args:
            url: Page address to fetch.
            timeout: Timeout in seconds handed to ``requests.get``.  Defaults
                to the value the script has always used.

        Returns:
            The page text, or ``None`` when ``requests`` raised a
            ``RequestException`` (a message is printed in that case).
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        try:
            html = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            print('超时' + url)
            return None
        html.encoding = 'gbk'
        print(str(html.status_code) + url)
        return html.text

    def _load_tree(self, url):
        """Fetch *url* and parse it with lxml; return None if that is not possible."""
        html = self.crawl_page(url)
        if not html:
            # The download failed (or returned nothing); there is nothing to
            # parse and etree.HTML() must not be handed None.
            return None
        return etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))

    def parse(self, trid, pid, url):
        """Parse one listing page whose rows link to child pages.

        Args:
            trid: Level number of the rows to extract (key into ``self.trdic``).
            pid: Code of the parent entry; stored as ``pcode`` on every row.
            url: Address of the listing page.

        Returns:
            ``None`` when *url* is blank or the page could not be loaded,
            otherwise a list of dicts with the keys ``url``, ``code``,
            ``name``, ``pcode``, ``grade`` and ``cat`` (always 0 here).
        """
        if url.strip() == '':
            return None
        tree = self._load_tree(url)
        if tree is None:
            return None

        nodes = tree.xpath(self.trdic.get(trid))
        if trid == 3 and len(nodes) == 0:
            # No county rows on this page: fall back to the town rows.
            # The extracted entries still get grade == trid (3).
            nodes = tree.xpath(self.trdic.get(4))
            print('有镇的市:' + url)

        # Child links are relative to the directory of the current page.
        # Only the trailing file name is cut off, so a file name that also
        # occurs earlier in the URL cannot corrupt the base.
        path = os.path.basename(url)
        base_url = url[:len(url) - len(path)]

        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            if len(nexturl) == 0:
                # NOTE(review): rows without a link are skipped; this is the
                # reading of the original `if len(nexturl) > 0:` guard whose
                # indentation was lost - confirm against the upstream script.
                continue
            # Prefer the anchor text, fall back to the bare cell text.
            code = node.xpath('./td[1]/a/text()') or node.xpath('./td[1]/text()')
            name = node.xpath('./td[2]/a/text()') or node.xpath('./td[2]/text()')
            values.append({
                'url': base_url + "".join(nexturl),
                'code': "".join(code),
                'name': "".join(name),
                'pcode': pid,
                'grade': trid,
                'cat': 0,
            })
        return values

    def parseVillager(self, trid, pid, url):
        """Parse a leaf page (three plain-text columns, no links).

        Args:
            trid: Level number of the rows to extract (key into ``self.trdic``).
            pid: Code of the parent entry; stored as ``pcode`` on every row.
            url: Address of the leaf page.

        Returns:
            ``None`` when *url* is blank or the page could not be loaded,
            otherwise a list of dicts with the keys ``code``, ``name``,
            ``pcode``, ``grade`` and ``cat``.  ``cat`` is the integer value of
            the second column (0 when that cell is not a plain number), so it
            has the same type as the ``cat`` produced by ``parse``.
        """
        if url.strip() == '':
            return None
        tree = self._load_tree(url)
        if tree is None:
            return None

        values = []
        for node in tree.xpath(self.trdic.get(trid)):
            code = node.xpath('./td[1]/text()')
            cate = node.xpath('./td[2]/text()')
            name = node.xpath('./td[3]/text()')
            # xpath() returns a list of text nodes; collapse it to a scalar so
            # it can be bound to the numeric placeholder of the INSERT.
            cat_text = "".join(cate).strip()
            values.append({
                'code': "".join(code),
                'name': "".join(name),
                'pcode': pid,
                'grade': trid,
                'cat': int(cat_text) if cat_text.isdigit() else 0,
            })
        return values

    def insert_to_db(self, taobao):
        """Insert the parsed rows into the ``fetch_area`` table.

        Args:
            taobao: Iterable of dicts as produced by ``parse`` /
                ``parseVillager``.

        Any exception raised while building or executing the batch is printed
        and the transaction is rolled back; it is not re-raised (best-effort
        behaviour of the original script).
        """
        if not taobao:
            # Nothing to write - do not open a connection for an empty batch.
            return

        # NOTE(review): connection settings and credentials are hard-coded;
        # consider moving them to configuration.
        conn = pymssql.connect(
            host=r'127.0.0.1',
            user=r'sa',
            password='123',
            database='areadb')
        try:
            cursor = conn.cursor()
            try:
                param = [
                    (p.get("code"), p.get("grade"), p.get("pcode"),
                     p.get("name").strip(), p.get("cat"))
                    for p in taobao
                ]
                # Positional INSERT: value order must match the column order
                # of fetch_area.
                sql = 'INSERT INTO fetch_area VALUES(%s,%d,%s,%s,%d)'
                cursor.executemany(sql, param)
                conn.commit()
            except Exception as e:
                conn.rollback()
                print(e)
            finally:
                cursor.close()
        finally:
            # Runs even when conn.cursor() itself fails, so the connection
            # is never leaked.
            conn.close()

    def parseChineseCity(self, pcode='520000000000',
                         url='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html'):
        """Crawl one province page and everything below it, storing each level.

        Args:
            pcode: Code stored as parent of the city rows.  Defaults to the
                province the script was written for.
            url: Address of the province page listing its cities.

        Levels walked: cities (2) -> counties (3) -> towns (4) -> villages (5).
        A level whose page cannot be parsed (``None``) is skipped together
        with everything below it.
        """
        citys = self.parse(2, pcode, url)
        if citys is None:
            return
        self.insert_to_db(citys)

        for city in citys:
            countys = self.parse(3, city['code'], city['url'])
            if countys is None:
                continue
            self.insert_to_db(countys)

            for county in countys:
                towns = self.parse(4, county['code'], county['url'])
                if towns is None:
                    continue
                self.insert_to_db(towns)

                for town in towns:
                    villagers = self.parseVillager(5, town['code'], town['url'])
                    if villagers is not None:
                        self.insert_to_db(villagers)


if __name__ == '__main__':
    # Use a distinct name so the class itself is not rebound to an instance.
    crawler = chinese_city()
    crawler.parseChineseCity()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值