Python: fetching the urban and rural division codes (5 levels: province, city, county, town, village)
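
The script below crawls the National Bureau of Statistics division-code pages for 2016, level by level. It seeds D:\curl.txt with the province page URLs, then loops: take the first URL in the file, parse that page's rows into D:\city.txt, append any child-page links back onto the queue, and pop the processed URL. Because the queue lives on disk, progress survives a crash (though rerunning the script re-seeds the provinces). The pages are served as GB2312/GBK, so the response bytes are decoded explicitly.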

    #!/usr/bin/env python
    # coding: utf-8
    import re
    import time
    from urllib import parse

    import requests
    from bs4 import BeautifulSoup

    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Host": "www.stats.gov.cn",
        "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'

    def gethtml(url_str):
        """Fetch a page and decode it; the site serves GB2312/GBK content."""
        try:
            html_s = req.get(url_str)
            if html_s.status_code == 200:
                # print('fetched page length:', len(html_s.text))
                # requests guesses the wrong charset, so decode the raw bytes as GBK.
                test_html = html_s.content.decode('gbk', errors='replace')
                # Anti-crawler responses lack the charset=gb2312 declaration;
                # retry until a real page comes back.
                while re.findall('charset=(.*?)"', test_html) != ['gb2312']:
                    print('retrying:', url_str)
                    html_s = req.get(url_str)
                    test_html = html_s.content.decode('gbk', errors='replace')
                return test_html
        except BaseException as e:
            print('fetch error:', e)
        return ''

    def writetxt(line_str, str_type):
        """Append one line to the URL queue (curl.txt) or the data file (city.txt)."""
        if str_type == 'url':
            file_str = r'D:\curl.txt'
        else:
            file_str = r'D:\city.txt'
        with open(file_str, 'a+', encoding='utf-8') as s_f:
            s_f.writelines(line_str + '\n')

    def province(url_str):
        """Level 1: parse the provinces off the index page."""
        if url_str != '':
            soup_html = re.findall("</td></tr>\r\n(.*)\r\n</table>", gethtml(url_str), re.S)
            soup_html = soup_html[0] if soup_html != [] else ''
            soup = BeautifulSoup(soup_html, 'lxml')
            for soup_tr in soup.findAll('tr', class_='provincetr'):
                for soup_td in soup_tr.find_all(name='a'):
                    soup_sid = soup_td['href'].split('.')[0]  # two-digit province code
                    soup_txt = soup_td.get_text()             # province name
                    soup_url = parse.urljoin(url_str, soup_td['href'])
                    print('level_1', ['0', soup_sid, soup_txt, soup_url])
                    writetxt(str(['level_1', '0', soup_sid, soup_txt, soup_url]), 'data')
                    writetxt(soup_url, 'url')

    def getcity(url_str):
        """Levels 2-5: parse one city/county/town/village listing page."""
        if url_str != '':
            soup_html = re.findall("</td></tr>\r\n(.*)\r\n</table>", gethtml(url_str), re.S)
            soup_html = soup_html[0] if soup_html != [] else ''
            soup = BeautifulSoup(soup_html, 'lxml')
            parent_code = re.findall(r"(\d+).html", url_str)
            parent_code = parent_code[0] if parent_code != [] else ''
            # Parent code length maps to the child level: 2 digits -> cities (2),
            # 4 -> counties (3), 6 -> towns (4), 9 -> villages (int(9/2+1) == 5).
            level = str(int(len(parent_code) / 2 + 1))
            class_str = {'2': 'citytr', '3': 'countytr', '4': 'towntr', '5': 'villagetr'}
            for soup_tr in soup.findAll('tr', class_=class_str[level]):
                soup_sid = re.findall(r'\d+', soup_tr.get_text())
                soup_sid = soup_sid[0] if soup_sid != [] else ''
                soup_txt = re.findall(r'\D+', soup_tr.get_text())
                soup_txt = soup_txt[0] if soup_txt != [] else ''
                soup_url = re.findall('href="(.*?)">', str(soup_tr))
                soup_url = parse.urljoin(url_str, soup_url[0]) if soup_url != [] else ''
                print('level_' + level, [parent_code, soup_sid, soup_txt, soup_url])
                writetxt(str(['level_' + level, parent_code, soup_sid, soup_txt, soup_url]), 'data')
                if soup_url != '':  # villages are leaves and have no link
                    writetxt(soup_url, 'url')

    def updateurl():
        """Pop the head of the queue file once it has been processed."""
        file_str = r'D:\curl.txt'
        with open(file_str, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        with open(file_str, 'w+', encoding='utf-8') as f_w:
            if lines != []:
                lines[0] = ''
                f_w.writelines(lines)

    def geturl():
        """Peek at the head of the queue file; returns '' when the queue is empty."""
        file_str = r'D:\curl.txt'
        lines_str = ''
        with open(file_str, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            if lines != []:
                lines_str = lines[0].strip()
        return lines_str

    req = requests.session()
    req.headers = header
    province(url)

    # Breadth-first crawl: keep consuming the queue until nothing is left.
    while True:
        try:
            current_url = geturl()
            if current_url == '':
                break  # queue exhausted -> all five levels done
            getcity(current_url)
            updateurl()
        except BaseException as e:
            print(e)
            time.sleep(1)
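
Since every record lands in D:\city.txt as the `str()` of a Python list, the data file can be parsed back with `ast.literal_eval`. Below is a minimal reader sketch; the dict keys are my own labels, not anything the crawler itself defines:

    # coding: utf-8
    # Minimal sketch: load the records the crawler wrote to D:\city.txt.
    # Field names are illustrative; the on-disk format is str(list), one per line.
    import ast

    records = []
    with open(r'D:\city.txt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            level_tag, parent, code, name, link = ast.literal_eval(line)
            records.append({
                'level': int(level_tag.split('_')[1]),  # 1 (province) .. 5 (village)
                'parent': parent,  # parent division code ('0' for provinces)
                'code': code,      # division code for this row
                'name': name,      # division name
                'url': link,       # child listing page ('' for leaf villages)
            })

    print(len(records), 'records loaded')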


Reposted from: https://www.cnblogs.com/Magicn/p/10214962.html
