#! python3
# coding:utf-8
import json
import os
import urllib
import urllib.request
from time import sleep
from urllib.parse import quote

import requests
# HTTP headers for the store-locator Ajax endpoint: mimics an IE11
# XMLHttpRequest; the Cookie value was captured from a live browser session.
main_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'DNT':'1'
,'Accept-Encoding': 'gzip, deflate'
,'Accept':'application/json, text/javascript, */*; q=0.01','X-Requested-With': 'XMLHttpRequest'
,'Accept-Language':'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'uid=CvwBTFrzEOOuGgVmEMKMAg==; has_js=1; __clickidc=152587901641657657; _ga=GA1.3.1733272225.1525879020; _gid=GA1.3.45889085.1525879020; _gat_UA-69298406-2=1; a=xpQGf0ZMu925; Drupal.session_cache.sid=yEU9GqkgJAl7'
}
# Directory where the result file is saved.
directory = "C:\\Temp\\"
# Province/city/county links.
# NOTE(review): never populated or read anywhere in this file — confirm
# whether it can be removed.
list_url = []
# The UTF-8 bytes below decode to the tab-separated table header
# "省市/区域\t城市\t商铺名称\t地址\t电话\t邮编\t经纬度\t授权号\t链接",
# i.e. province/region, city, store name, address, phone, postal code,
# coordinates, authorization number, link.
table_head = b'\xe7\x9c\x81\xe5\xb8\x82/\xe5\x8c\xba\xe5\x9f\x9f\t\xe5\x9f\x8e\xe5\xb8\x82\t\xe5\x95\x86\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80\t\xe7\x94\xb5\xe8\xaf\x9d\t\xe9\x82\xae\xe7\xbc\x96\t\xe7\xbb\x8f\xe7\xba\xac\xe5\xba\xa6\t\xe6\x8e\x88\xe6\x9d\x83\xe5\x8f\xb7\t\xe9\x93\xbe\xe6\x8e\xa5'
# Accumulated result rows; starts with the decoded header line.
content = [table_head.decode() + '\n']
# b'\xe4\xb8\xad\xe5\x9b\xbd' decodes to "中国" ("China") — the search address.
china = b'\xe4\xb8\xad\xe5\x9b\xbd'
def _fetch_store_page(pagenum):
    """Fetch one page of store records from the store-locator Ajax endpoint.

    Retries forever on connection failures, sleeping 5 seconds between
    attempts (same best-effort behaviour as before).

    Args:
        pagenum: zero-based page index to request.

    Returns:
        The parsed JSON payload — a list of store dicts, empty once
        ``pagenum`` is past the last page.
    """
    # NOTE(review): quote() pre-encodes the address even though requests
    # form-encodes the payload again; kept as-is because the server
    # evidently accepts the double-encoded value — confirm before changing.
    payload = {
        'address': quote(china.decode()),
        'attributes': '',
        'language': 'zh',
        'pagenum': pagenum,
        'curlat': 39.904989,
        'curlng': 116.405285,
    }
    while True:
        try:
            r = requests.post("https://www.dulux.com.cn/find/store-ajax",
                              data=payload, headers=main_headers)
        except requests.exceptions.ConnectionError:
            # Connection failed: wait 5 seconds, then retry indefinitely.
            sleep(5)
        else:
            return json.loads(r.text)


def getAllProvinces():
    """Crawl every page of store data for all provinces/cities/counties.

    Appends one tab-separated line per store to the module-level
    ``content`` list (side effect); performs network I/O.

    Fixes over the original:
    * page 0 was fetched but its records were discarded (overwritten by
      page 1 before being processed) — every page is now appended;
    * the dead ``urllib.request`` opener setup was removed (all HTTP goes
      through ``requests``);
    * the duplicated payload/retry/request code is factored into
      ``_fetch_store_page``.
    """
    pagenum = 0
    while True:
        data = _fetch_store_page(pagenum)
        if not data:
            # Empty list => past the last page; crawl is complete.
            break
        for item in data:
            # lon/lat arrive as strings; they are joined as "lon,lat".
            content.append('\t'.join((
                item['region'],
                item['city'],
                item['companyname'],
                item['address'],
                item['phone'],
                item['postalcode'],
                item['lon'] + ',' + item['lat'],
                item['subtitle'],     # authorization number
                item['websiteurl'],
            )) + '\n')
        # Progress indicator: the page just processed.
        print(pagenum)
        pagenum += 1
def outputResultFile(lines=None, out_dir=None):
    """Write the crawled rows to ``fendian_duoleshi.txt``.

    Args:
        lines: iterable of newline-terminated strings to write; defaults
            to the module-level ``content`` result list.
        out_dir: destination directory; defaults to the module-level
            ``directory`` setting. Both defaults keep the original
            zero-argument call working unchanged.
    """
    if lines is None:
        lines = content
    if out_dir is None:
        out_dir = directory
    # os.path.join avoids the doubled backslash the original produced
    # ("C:\\Temp\\" + "\\fendian..."); Windows tolerated it, but the
    # joined path is correct on any platform.
    path_name = os.path.join(out_dir, 'fendian_duoleshi.txt')
    with open(path_name, 'w', encoding='utf-8') as file:
        # writelines replaces the index-based loop over the list.
        file.writelines(lines)
def start_crawl():
    """Top-level crawl driver.

    Scrapes every store record for all provinces/cities/counties, then
    writes the collected rows to the result file.
    """
    # Step 1: collect all store records into the module-level list.
    getAllProvinces()
    # Step 2: flush the collected rows to disk.
    outputResultFile()
    # Step 3: signal completion.
    print('Complete!')
# Entry point: only start crawling when executed as a script, so that
# importing this module no longer triggers the full crawl as a side effect.
if __name__ == '__main__':
    start_crawl()