#! python3
# coding:utf-8
import urllib.request
import requests
from bs4 import BeautifulSoup
from time import sleep
# Request headers for fetching the search result pages
result_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Connection': 'Keep-Alive',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2',
    # Cookie captured from a browser session; it may have expired and need refreshing
    'Cookie': 'ASP.NET_SessionId=mwwuspudqdxsm354g5xsqugd; CNZZDATA5617436=cnzz_eid%3D1133206276-1525702343-%26ntime%3D1525702343; Hm_lvt_f0a8192e2505f1dedce950f8913237d6=1525702344; Hm_lpvt_f0a8192e2505f1dedce950f8913237d6=1525702344; bdshare_firstime=1525702344314; Hm_lvt_fc0f20ff478c08368cdbb875241c3fe8=1525702344; Hm_lpvt_fc0f20ff478c08368cdbb875241c3fe8=1525702344; safedog-flow-item=8328FEE3689468599F693C625BEDF947; UM_distinctid=1633af23bc1b-02a69eb9c5ba15-38401c1f-e82f7-1633af23bc2159',
}
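# A quick way to check that these headers (and in particular the session cookie) are still
# accepted by the site -- a minimal sketch, assuming the listing URL used later in this script;
# it is illustrative only and not part of the crawl:
#
#     r = requests.get("http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx",
#                      headers=result_headers)
#     print(r.status_code)   # a 200 response means the request itself went through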
# Directory where the result file is saved
directory = "C:\\Temp\\"
# b'\xe5\xba\x97\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80' == "店铺名称\t地址".encode()  (i.e. "shop name\taddress")
table_head = b'\xe5\xba\x97\xe9\x93\xba\xe5\x90\x8d\xe7\xa7\xb0\t\xe5\x9c\xb0\xe5\x9d\x80'
# List of result rows, starting with the header row
content = [table_head.decode() + '\n']
def getShops(list_shop):
    '''
    Collect every shop on a single page.
    Each <dl> is expected to hold the shop name in its first <dt>
    and the address in its first <p>.
    '''
    for shop_index in range(len(list_shop)):
        list_dt = list_shop[shop_index].find_all("dt")
        list_p = list_shop[shop_index].find_all("p")
        shop = list_dt[0].text
        address = list_p[0].text
        content.append('\t'.join((shop, address)) + '\n')
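# A minimal sketch of how getShops consumes the markup, assuming each listing looks
# roughly like the <div class="xianshi_dz"><dl><dt>name</dt>...<p>address</p></dl> structure
# the parser expects (the sample HTML is illustrative, not taken from the site), kept
# commented out so it does not add rows to `content` during a real run:
#
#     sample = BeautifulSoup(
#         '<div class="xianshi_dz"><dl><dt>Sample Shop</dt><p>Sample Address</p></dl></div>',
#         "html.parser")
#     getShops(sample.find("div", "xianshi_dz").find_all("dl"))
#     # content now ends with 'Sample Shop\tSample Address\n'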
def getAllShops():
    '''
    Collect the shop information from every result page.
    '''
    # urllib.request.urlopen() does not support authentication, cookies, or other advanced
    # HTTP features on its own; build_opener() creates a custom opener for that.
    # (Note: the requests calls below do not go through this opener.)
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install this opener as the global opener used by urlopen()
    urllib.request.install_opener(opener)
    # If the connection fails, wait 5 seconds and retry indefinitely
    success = False
    while not success:
        try:
            url = "http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx"
            print(url)
            # Send a GET request for the first page of shop listings
            r = requests.get(url, headers=result_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            success = True
    # Parse the response
    html = r.text
    bsObj = BeautifulSoup(html, "html.parser")
    list_link = bsObj.find("div", "page").find_all("span")
    pages = int(list_link[len(list_link) - 2].text)
    list_shop = bsObj.find("div", "xianshi_dz").find_all("dl")
    getShops(list_shop)
    # Fetch the remaining paginated result pages
    for page_index in range(2, pages + 1):
        url = "http://www.carpoly.com.cn/Decorative/Service/Shop_list.aspx?currentPage=" + str(page_index) + "&pageSize=10"
        print(url)
        # If the connection fails, wait 5 seconds and retry indefinitely
        success = False
        while not success:
            try:
                # Send a GET request for this page of shop listings
                r = requests.get(url, headers=result_headers)
            except requests.exceptions.ConnectionError:
                sleep(5)
            else:
                success = True
        html_data = r.text
        ship_bsObj = BeautifulSoup(html_data, "html.parser")
        # Parse the current page's soup, not the first page's bsObj
        list_shop = ship_bsObj.find("div", "xianshi_dz").find_all("dl")
        getShops(list_shop)
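# The retry-on-ConnectionError loop above appears twice; a sketch of one way to factor it
# out (get_with_retry is a hypothetical helper and is not called by the code in this script):
def get_with_retry(url, headers, delay=5):
    '''Keep retrying a GET request until it succeeds, sleeping `delay` seconds between attempts.'''
    while True:
        try:
            return requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            sleep(delay)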
def outputResultFile():
    '''
    Write the collected rows to the result file.
    '''
    path_name = directory + 'fendian_jiabaoli.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # Write every row collected in `content`
        for row in content:
            file.write(row)
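# The output is effectively a tab-separated file; a sketch of the same output written with
# the csv module instead (outputResultCsv is a hypothetical alternative, not called anywhere,
# assuming each row in `content` keeps the "name\taddress\n" shape built by getShops):
def outputResultCsv(path_name=directory + 'fendian_jiabaoli.csv'):
    import csv
    with open(path_name, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        for row in content:
            writer.writerow(row.rstrip('\n').split('\t'))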
def start_crawl():
    '''
    Main entry point of the crawler.
    '''
    # Collect all shop information
    getAllShops()
    # Write the result file
    outputResultFile()
    # Done
    print('Complete!')
# Run the crawler
start_crawl()