#! python3
# coding:utf-8
import json
import urllib
import urllib.error
from time import sleep
from urllib import request
from urllib.request import urlopen

import requests
# Request headers used when fetching the province list.
# NOTE(review): the hard-coded session cookies below will expire over time;
# refresh them if the site starts rejecting requests — TODO confirm.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'Accept':'text/html, application/xhtml+xml, */*','Connection':'Keep-Alive','DNT':'1'
,'Accept-Language':'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'ASP.NET_SessionId=wrc0hppaej4hxoqkhtgng3ds; SC_ANALYTICS_GLOBAL_COOKIE=c14d814822b941e493eccd07f9b51a49|False; Hm_lvt_86fab548a9cd225ca6996b8f596fb011=1525699410; Hm_lpvt_86fab548a9cd225ca6996b8f596fb011=1525699410; nTalk_CACHE_DATA={uid:lb_1000_ISME9754_guest4B85A27C-78B2-2C,tid:1525699406108016}; NTKF_T2D_CLIENTID=guest4B85A27C-78B2-2C72-684D-3AC577F644C7; _ga=GA1.3.1823818086.1525699427; _gid=GA1.3.1721887409.1525699427; _gat=1'
}
# Request headers used when fetching district and city/county data
# (form-encoded POSTs to a JSON endpoint).
area_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
,'Accept':'application/json, text/javascript, */*; q=0.01','Connection':'Keep-Alive','DNT':'1'
,'Accept-Encoding': 'gzip, deflate','X-Requested-With': 'XMLHttpRequest'
,'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
,'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2','Cookie':'PHPSESSID=9d47ec7b3084bb4dd988070d05dd1954; SERVERID=dee5d74d516b027440f035efd0ad1d40|1525866565|1525866537; _smt_uid=5af2e02d.f9f56a0'
}
# Directory where the result file is written.
directory = "C:\\Temp\\"
# Store-search payloads, one dict per province/city/county combination.
list_url = []
# This byte literal decodes (UTF-8) to "省名\t城市\t店名\t电话\t地址\t授权号",
# i.e. "Province\tCity\tStore name\tPhone\tAddress\tLicense no." — the
# tab-separated header row of the output file.
table_head = b'\xe7\x9c\x81\xe5\x90\x8d\t\xe5\x9f\x8e\xe5\xb8\x82\t\xe5\xba\x97\xe5\x90\x8d\t\xe7\x94\xb5\xe8\xaf\x9d\t\xe5\x9c\xb0\xe5\x9d\x80\t\xe6\x8e\x88\xe6\x9d\x83\xe5\x8f\xb7'
# Accumulated output rows; seeded with the decoded header line.
content = [table_head.decode() + '\n']
# This byte literal decodes (UTF-8) to "地址关键字或门店编号"
# ("address keyword or store number") — sent as the 'search' form field.
search_blank = b'\xe5\x9c\xb0\xe5\x9d\x80\xe5\x85\xb3\xe9\x94\xae\xe5\xad\x97\xe6\x88\x96\xe9\x97\xa8\xe5\xba\x97\xe7\xbc\x96\xe5\x8f\xb7'
def getAllProvinces():
    """Crawl the full province -> city -> county tree and all store records.

    First fetches the province list, then walks province -> district ->
    city/county, appending one ``{pro, city, store, search}`` payload per
    county to the module-level ``list_url``. Finally each payload is posted
    to the store-search endpoint and one tab-separated row per store is
    appended to the module-level ``content`` list.

    Every network request is retried forever with a 5-second back-off on
    connection failure. Returns ``None``; results live in the globals.
    """
    # urlopen() alone does not support cookies or other advanced HTTP
    # features; build a custom opener and install it as the global opener
    # used by every urlopen() call below.
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

    # Fetch the province list, retrying forever on connection failure.
    response = None
    while response is None:
        try:
            req = request.Request("http://www.huarun.com/index.php/huarun/Pro",
                                  headers=headers)
            response = urlopen(req)
        except urllib.error.URLError:
            # BUG FIX: urlopen() raises urllib.error.URLError, not
            # requests.exceptions.ConnectionError — the original handler
            # could never catch a failed connection here and the script
            # crashed instead of retrying.
            sleep(5)
    list_province = json.loads(response.read().decode('utf-8'))["msg"]
    print(list_province)

    # Walk province -> district -> city/county, collecting one search
    # payload per county.
    for province in list_province:
        pro = province["pro"].encode()
        list_area = _post_for_msg("http://www.huarun.com/index.php/huarun/Area",
                                  {'pro': pro})
        print(list_area)
        for area in list_area:
            list_city = _post_for_msg("http://www.huarun.com/index.php/huarun/City",
                                      {'city': area["area"].encode()})
            print(list_city)
            for city in list_city:
                # NOTE(review): "store" is deliberately set to the same value
                # as "city", mirroring the original request payload — confirm
                # the endpoint really expects this duplication.
                list_url.append({"pro": pro,
                                 "city": city["city"].encode(),
                                 "store": city["city"].encode(),
                                 "search": search_blank})

    # Query every collected payload and accumulate the store rows.
    for payload in list_url:
        list_search = _post_for_msg(
            "http://www.huarun.com/index.php/huarun/SearchStore", payload)
        print(list_search)
        # Sample element:
        # {'address': '安徽省安庆市光彩大市场三期天柱山路109号-110号',
        #  'auth': '皖00065', 'phone': '18505226929',
        #  'sname': '安庆光彩大市场', 'vip': '0'}
        for store in list_search:
            # Strip embedded newlines so each record stays on one output line.
            address = store["address"].replace('\n', '').replace('\r', '')
            auth = store["auth"]
            phone = store["phone"].replace('\n', '').replace('\r', '')
            sname = store["sname"].replace('\n', '').replace('\r', '')
            content.append('\t'.join((payload["pro"].decode(),
                                      payload["city"].decode(),
                                      sname, phone, address, auth)) + "\n")


def _post_for_msg(url, payload):
    """POST *payload* to *url* and return the parsed JSON "msg" list.

    Retries forever with a 5-second back-off on connection failure —
    replaces the three identical retry loops of the original code.
    """
    while True:
        try:
            r = requests.post(url, data=payload, headers=area_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            return json.loads(r.text)["msg"]
def outputResultFile():
    """Write every accumulated row in ``content`` to the result file.

    The file is created (or overwritten) as UTF-8 text under the
    module-level ``directory``.
    """
    path_name = directory + '\\fendian_huarun.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # content already carries a trailing '\n' per row, so a bulk
        # writelines() reproduces the original line-by-line output exactly.
        file.writelines(content)
def start_crawl():
    """Crawler entry point: collect all store data, then dump it to disk."""
    # Crawl every province/city/county and populate the result list.
    getAllProvinces()
    # Persist the accumulated rows to the output file.
    outputResultFile()
    print('Complete!')
# Run the crawler only when executed as a script; guarding the call keeps
# the module importable (e.g. for testing) without triggering the crawl.
if __name__ == '__main__':
    start_crawl()