#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Purpose: crawl province/city/county/town/village division codes from the
# NBS (stats.gov.cn) 2019 administrative-division pages into SQL Server.
# Version: v1.1

import importlib
import os
import sys

import lxml.etree as etree
import pymssql
import requests

# NOTE(review): the original called importlib.reload(sys) — a Python 2
# sys.setdefaultencoding hack that is a no-op on Python 3, so it was dropped.


class chinese_city():
    """Crawler for the NBS administrative-division code pages.

    Grade levels: 1=province, 2=city, 3=county, 4=town, 5=village.
    """

    def __init__(self):
        # XPath selector for each administrative grade's table rows.
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]',
        }

    def crawl_page(self, url):
        """Fetch *url* and return its HTML text (decoded as GBK), or None
        when the request fails or times out."""
        headers = {
            'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; '
                           'rv:71.0) Gecko/20100101 Firefox/71.0'),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        try:
            html = requests.get(url, headers=headers, timeout=1000)
            html.encoding = 'gbk'  # the site serves GBK-encoded pages
            print(str(html.status_code) + url)
            return html.text
        except requests.exceptions.RequestException:
            print('超时' + url)
            return None  # fix: was an implicit fall-through None; made explicit

    # Parse one listing page according to its grade (trid).
    def parse(self, trid, pid, url):
        """Parse a listing page at grade *trid* whose parent code is *pid*.

        Returns a list of dicts with keys url/code/name/pcode/grade/cat,
        or None when *url* is blank or the page could not be fetched.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:
            # fix: the original passed None straight into etree.HTML and
            # crashed whenever a request timed out.
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        if trid == 3:
            nodes = tree.xpath(self.trdic.get(trid))
            if len(nodes) == 0:
                # Some city pages skip the county level and list town rows.
                nodes = tree.xpath(self.trdic.get(4))
                print('有镇的市:' + url)
        else:
            nodes = tree.xpath(self.trdic.get(trid))
        path = os.path.basename(url)
        base_url = url.replace(path, '')
        values = []
        for node in nodes:
            nexturl = node.xpath('./td[1]/a/@href')
            # Rows without a link have no child page and are skipped,
            # matching the original behavior.
            if len(nexturl) > 0:
                code = node.xpath('./td[1]/a/text()')
                if len(code) == 0:
                    code = node.xpath('./td[1]/text()')
                name = node.xpath('./td[2]/a/text()')
                if len(name) == 0:
                    name = node.xpath('./td[2]/text()')
                values.append({
                    'url': base_url + "".join(nexturl),
                    'code': "".join(code),
                    'name': "".join(name),
                    'pcode': pid,
                    'grade': trid,
                    'cat': 0,  # non-village rows carry no category code
                })
        return values

    # Parse a village (leaf) page.
    def parseVillager(self, trid, pid, url):
        """Parse a village page: each row is code / category / name.

        Returns a list of dicts with keys code/name/pcode/grade/cat,
        or None when *url* is blank or the page could not be fetched.
        """
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        if html is None:  # fix: guard against a failed fetch (see parse)
            return None
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath(self.trdic.get(trid))
        values = []
        for node in nodes:
            code = node.xpath('./td[1]/text()')
            cate = node.xpath('./td[2]/text()')
            name = node.xpath('./td[3]/text()')
            values.append({
                'code': "".join(code),
                'name': "".join(name),
                'pcode': pid,
                'grade': trid,
                # fix: the original stored the raw xpath result list here,
                # which insert_to_db's placeholder could not serialize.
                'cat': "".join(cate),
            })
        return values

    # Insert crawled rows into the database.
    def insert_to_db(self, taobao):
        """Bulk-insert rows into fetch_area; rolls back and prints on error."""
        conn = pymssql.connect(
            host=r'127.0.0.1',
            user=r'sa',
            password='123',
            database='areadb')
        cursor = conn.cursor()
        try:
            param = [(p.get("code"), p.get("grade"), p.get("pcode"),
                      p.get("name").strip(), p.get("cat"))
                     for p in taobao]
            # fix: the last placeholder was %d, but village rows carry the
            # category code as a string — %s accepts both it and the int 0.
            sql = 'INSERT INTO fetch_area VALUES(%s,%d,%s,%s,%s)'
            cursor.executemany(sql, param)
            conn.commit()
        except Exception as e:
            conn.rollback()
            print(e)
        finally:
            if cursor:
                cursor.close()
            if conn:
                conn.close()

    # Run the full crawl from the top.
    def parseChineseCity(self):
        """Crawl province 52 (Guizhou) top-down: city -> county -> town ->
        village, inserting each level into the database as it is parsed."""
        citys = self.parse(
            2, '520000000000',
            'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/52.html')
        if citys is not None:
            self.insert_to_db(citys)
            for city in citys:
                countys = self.parse(3, city['code'], city['url'])
                if countys is not None:
                    self.insert_to_db(countys)
                    for county in countys:
                        towns = self.parse(4, county['code'], county['url'])
                        if towns is not None:
                            self.insert_to_db(towns)
                            for town in towns:
                                villagers = self.parseVillager(
                                    5, town['code'], town['url'])
                                if villagers is not None:
                                    self.insert_to_db(villagers)


if __name__ == '__main__':
    chinese_city = chinese_city()
    chinese_city.parseChineseCity()