A Python 3 crawler for fetching SKSHU Paint (三棵树漆) store-branch information
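The script below first requests the store-search result page on www.skshu.com.cn, reads the total page count from the data-total-page attribute, then POSTs to the /Agency/ajax_lists.html endpoint for each remaining page. Every shop row is parsed with BeautifulSoup and written out as a tab-separated text file.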

#! python3
# coding:utf-8
import urllib.request
import requests
from bs4 import BeautifulSoup
from time import sleep

# Request headers for the store-search result pages
# (the Cookie value was captured from a browser session and may need refreshing)
result_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Connection': 'Keep-Alive',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2',
    'Cookie': 'PHPSESSID=f1hhe1n10b735d59vcomgdql91; Hm_lvt_af55d26623d35566407534f69e434677=1525910281; Hm_lpvt_af55d26623d35566407534f69e434677=1525910281; __v3_c_sesslist_11210=f0w1i6y24h_dmk; __v3_c_pv_11210=1; __v3_c_session_11210=1525910280117569; __v3_c_today_11210=1; __v3_c_review_11210=0; __v3_c_last_11210=1525910280117; __v3_c_visitor=1525910280117569; __v3_c_session_at_11210=1525910292142'
}

# Directory where the result file will be saved
directory = "C:\\Temp\\"
# Collected rows, one tab-separated line per shop
content = []

def getShops(list_shop):
    '''
    Extract every shop row from a single result page
    '''
    for shop_row in list_shop:
        list_td = shop_row.find_all("td")
        shop = list_td[0].text
        province = list_td[1].text
        city = list_td[2].text
        state = list_td[3].text
        address = list_td[4].text
        identification = list_td[5].text
        content.append('\t'.join([province, city, state, shop, address, identification]) + '\n')

def getAllShops():
    '''
    Fetch the shop listings from every result page
    '''
    # urllib.request.urlopen() does not support authentication, cookies or other advanced
    # HTTP features on its own; build_opener() creates a custom opener object for that.
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install the opener as the global opener used by urlopen()
    # (the actual HTTP calls below are made with the requests library)
    urllib.request.install_opener(opener)
    # On connection failure, wait 5 seconds and retry indefinitely
    success = False
    while not success:
        try:
            # Request the SKSHU Paint store-search page
            r = requests.get("http://www.skshu.com.cn/Agency/result.html?province_id=&city_id=&keyword=", headers=result_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            success = True
    # Parse the first result page and read the total number of pages
    html = r.text
    bsObj = BeautifulSoup(html, "html.parser")
    pages = int(bsObj.find(href="/Agency/ajax_lists.html").attrs["data-total-page"])
    list_shop = bsObj.find(id="content-list").find_all("tr")
    getShops(list_shop)
    # Fetch the remaining pages through the AJAX listing endpoint
    for page_index in range(2, pages + 1):
        payload = {'p': page_index}
        print(page_index)
        r = requests.post("http://www.skshu.com.cn/Agency/ajax_lists.html", data=payload, headers=result_headers)
        html_data = r.text
        shop_bsObj = BeautifulSoup(html_data, "html.parser")
        list_shop = shop_bsObj.find_all("tr")
        getShops(list_shop)

def outputResultFile():
    '''
    Write the collected rows to the result file
    '''
    path_name = directory + 'fendian_sankeshu.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # Write out each collected line
        for line in content:
            file.write(line)

def start_crawl():
    '''
    Main crawler routine
    '''
    # Collect the shop information from every page
    getAllShops()
    # Write the results to the output file
    outputResultFile()
    # Done
    print('Complete!')

# Run the crawler
start_crawl()
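
The output is a plain tab-separated file, so it can be inspected or post-processed with the standard csv module. Below is a minimal sketch, assuming the default output path C:\Temp\fendian_sankeshu.txt produced by the script above; the field order matches what getShops() writes.

# Minimal sketch: read back the tab-separated result file produced above.
# Assumes the default output path C:\Temp\fendian_sankeshu.txt from this script.
import csv

with open(r'C:\Temp\fendian_sankeshu.txt', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='\t')
    for province, city, state, shop, address, identification in reader:
        # Print a few columns of each shop record
        print(province, city, shop, address)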