Python Web Crawler

Downloading.py

def download(city, county, town, village, num):
    path = r'/Users/chengxu/PycharmProjects/AdcodeSpider/result/tjyqhdmhcxhfdm2021_' + num + '.txt'
    dic = {**city, **county, **town, **village}  # merge the four level dictionaries into one
    try:
        with open(path, 'a', encoding='utf-8') as f:  # open the file once, not once per record
            for i in dic.values():
                f.write('"' + i['qhdm'] + '","' + i['name'] + '","' + i['cxfldm'] + '"' + '\n')
        print(num + " write finished!")
    except Exception as e:
        print('write file failed.....')
        print(e)
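
A minimal call might look like this (the dictionary below is made-up sample data; the real dictionaries come from Spiders.spider):

# Hypothetical sample record; real data comes from Spiders.spider(aimurl, num)
city = {'1301': {'qhdm': '130100000000', 'name': '石家庄市', 'cxfldm': '0'}}
download(city, {}, {}, {}, '13')
# Appends to tjyqhdmhcxhfdm2021_13.txt:
# "130100000000","石家庄市","0"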

Scheduler.py

from Spider02 import Spiders
from Spider02 import Downloading
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

# Target URL and the province-level codes to crawl
aimurl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/"
aimurllist = ["11", "12", "13", "14", "15", "21", "22", "23", "31", "32", "33", "34", "35", "36", "37",
              "41", "42", "43", "44", "45", "46", "50", "51", "52", "53", "54", "61", "62", "63", "64", "65"]
# aimurllist = ["11", "12", "13", "14", "15", "21", "22"]  # smaller subset for testing


def run_proc(url, num):
    try:
        print(num + ' is running')
        (city, county, town, village) = Spiders.spider(url, num)
        Downloading.download(city, county, town, village, num)
        print(num + ' ended')
    except Exception as e:
        print('run_proc failed.....')
        print(e)


if __name__ == "__main__":
    # Option 1: sequential crawl, one province at a time
    # for i in aimurllist:
    #     print(i + " start crawling......")
    #     run_proc(aimurl, i)

    # Option 2: process pool
    # p = Pool(1)
    # for i in aimurllist:
    #     p.apply_async(run_proc, args=(aimurl, i))
    # print('Waiting for all subprocesses done ...')
    # p.close()  # stop accepting new tasks
    # p.join()   # block until every worker process has finished

    # Option 3 (used here): thread pool, ten provinces in flight at once
    executor = ThreadPoolExecutor(max_workers=10)
    all_task = [executor.submit(run_proc, aimurl, url) for url in aimurllist]
    wait(all_task, return_when=ALL_COMPLETED)
    print('All subprocesses done')
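
To sanity-check the pipeline before launching the full thread pool, a single sequential call (Beijing, code 11) is enough:

run_proc(aimurl, "11")  # crawl and write one province only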

Spiders.py

import requests
from bs4 import BeautifulSoup
import random
import time

# Pick a random User-Agent header to reduce the chance of triggering anti-crawler blocking
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]


# Retry failed requests to raise the crawl success rate; in testing, one retry is usually enough on a healthy network
def getsoup(url, num_retries=6):
    user_agent = random.choice(ua_list)
    headers = {"User-Agent": user_agent}
    try:
        res = requests.get(url, headers=headers)  # fetch the page with a GET request
        # No charset is declared by the server, so requests falls back to ISO-8859-1;
        # re-encode to the raw bytes and decode as UTF-8, the page's actual encoding
        soup = BeautifulSoup(res.text.encode('ISO-8859-1').decode('utf8'), 'html.parser')  # parse with BeautifulSoup
        return soup
    except Exception as e:
        if num_retries > 0:
            time.sleep(10)
            print(url)
            print('request failed, retries left: ' + str(num_retries) + '  ' + time.ctime())
            return getsoup(url, num_retries - 1)
        else:
            print("retry fail!")
            print("error: %s" % e + "   " + url)
            return  # return None; the caller will then fail and stop
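
A quick usage sketch (assuming the 2021 index pages are still reachable at this URL):

soup = getsoup('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/11.html')
if soup is not None:
    print(len(soup.select('.citytr')))  # number of prefecture-level rows for Beijing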


# Fetch prefecture (city) level codes
def getsecond(url, num):
    city = {}
    try:
        soup = getsoup(url + num + '.html')
        for j in soup.select('.citytr'):
            id = str(j.select('td')[0].text)  # 130100000000
            city[id[0:4]] = {'qhdm': id, 'name': j.select('td')[1].text, 'cxfldm': '0'}
        return city
    except Exception as e:
        print('getsecond --- exception: ' + url + num + '.html')
        print(e)
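
The returned dictionary is keyed by the 4-digit city prefix. For Hebei (num='13'), an entry would look roughly like this (values shown are illustrative):

city = getsecond(aimurl, '13')
# {'1301': {'qhdm': '130100000000', 'name': '石家庄市', 'cxfldm': '0'}, ...}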


# Fetch county (district) level codes
def getthird(url, lists):
    county = {}
    try:
        for i in lists:
            soup = getsoup(url + i[0:2] + '/' + i + '.html')
            for j in soup.select('.countytr'):
                id = str(j.select('td')[0].text)  # 130201000000
                county[id[0:6]] = {'qhdm': id, 'name': j.select('td')[1].text, 'cxfldm': '0'}
        return county
    except Exception as e:
        print('getthird --- exception: ' + url + i[0:2] + '/' + i + '.html')
        print(e)


# Fetch town level codes (municipal districts have no lower-level codes)
def getfourth(url, lists):
    town = {}
    try:
        for i in lists:
            # print(url + i[0:2] + '/' + i[2:4] + '/' + i + '.html')
            soup = getsoup(url + i[0:2] + '/' + i[2:4] + '/' + i + '.html')
            for j in soup.select('.towntr'):
                id = str(j.select('td')[0].text)  # 130202001000
                town[id[0:9]] = {'qhdm': id, 'name': j.select('td')[1].text, 'cxfldm': '0'}  # 130202001
        return town
    except Exception as e:
        print('getfourth --- exception: ' + url + i[0:2] + '/' + i[2:4] + '/' + i + '.html')
        print(e)


# Fetch village level codes
def getfifth(url, lists):
    village = {}
    try:
        for i in lists:
            # print(url + i[0:2] + '/' + i[2:4] + '/' + i[4:6] + '/' + i + '.html')
            soup = getsoup(url + i[0:2] + '/' + i[2:4] + '/' + i[4:6] + '/' + i + '.html')
            for j in soup.select('.villagetr'):
                id = str(j.select('td')[0].text)  # 110101001001
                village[id[0:12]] = {'qhdm': id, 'name': j.select('td')[2].text,
                                     'cxfldm': j.select('td')[1].text}  # 110101001001
        return village
    except Exception as e:
        print('getfifth --- exception: ' + url + i[0:2] + '/' + i[2:4] + '/' + i[4:6] + '/' + i + '.html')
        print(e)
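
All four levels slice the same 12-digit division code at fixed offsets: 2 digits identify the province, 4 the city, 6 the county, 9 the town, and 12 the village; the directory path on the site nests by the 2-digit pairs. A small sketch of that convention (the sample code is illustrative):

qhdm = '110101001001'  # illustrative village-level code
print(qhdm[0:2], qhdm[0:4], qhdm[0:6], qhdm[0:9], qhdm[0:12])
# 11 1101 110101 110101001 110101001001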


def spider(aimurl, num):
    city = getsecond(aimurl, num)
    print(num + ' city finished!')
    county = getthird(aimurl, city)
    print(num + ' county finished!')
    town = getfourth(aimurl, county)
    print(num + ' town finished!')
    village = getfifth(aimurl, town)
    print(num + ' village finished!')
    print(num + " crawl finished!Now,writing into txt...")
    return city, county, town, village
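
Putting the three modules together, one province can be crawled and written out like so (a sketch; it assumes the modules live in the Spider02 package, as imported in Scheduler.py):

from Spider02 import Spiders, Downloading

aimurl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/'
city, county, town, village = Spiders.spider(aimurl, '11')
Downloading.download(city, county, town, village, '11')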
