#! python3
# coding:utf-8
import urllib.request
import requests
from bs4 import BeautifulSoup
from time import sleep
# Request headers for scraping the search-results page
result_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Connection': 'Keep-Alive',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2',
    'Cookie': 'PHPSESSID=f1hhe1n10b735d59vcomgdql91; Hm_lvt_af55d26623d35566407534f69e434677=1525910281; Hm_lpvt_af55d26623d35566407534f69e434677=1525910281; __v3_c_sesslist_11210=f0w1i6y24h_dmk; __v3_c_pv_11210=1; __v3_c_session_11210=1525910280117569; __v3_c_today_11210=1; __v3_c_review_11210=0; __v3_c_last_11210=1525910280117; __v3_c_visitor=1525910280117569; __v3_c_session_at_11210=1525910292142'
}
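
# A hedged alternative (sketch, not from the original): the hardcoded PHPSESSID
# cookie above is session-specific and will expire. A requests.Session can fetch
# fresh cookies from the search page instead; make_session() is a hypothetical
# helper name.
def make_session():
    '''Return a requests.Session seeded with cookies from the search page (sketch).'''
    session = requests.Session()
    # Reuse the headers above, minus the stale hardcoded Cookie
    session.headers.update({k: v for k, v in result_headers.items() if k != 'Cookie'})
    session.get("http://www.skshu.com.cn/Agency/result.html?province_id=&city_id=&keyword=")
    return session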
# Directory where the result file is saved
directory = "C:\\Temp\\"
# Collected result lines
content = []
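
# A bounded-retry helper (sketch): the loop in getAllShops() below retries
# forever on connection errors, which can hang the crawler if the site is down.
# Capping the attempts is safer; max_attempts and the 5-second delay here are
# assumptions, not taken from the original code.
def get_with_retry(url, max_attempts=5, **kwargs):
    '''GET url, retrying on connection errors up to max_attempts times (sketch).'''
    for _ in range(max_attempts):
        try:
            return requests.get(url, **kwargs)
        except requests.exceptions.ConnectionError:
            sleep(5)
    raise requests.exceptions.ConnectionError(
        'giving up on %s after %d attempts' % (url, max_attempts))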
def getShops(list_shop):
    '''
    Extract all shops from a single results page.
    '''
    for shop_row in list_shop:
        list_td = shop_row.find_all("td")
        # Skip header or otherwise malformed rows (defensive guard, not in the original)
        if len(list_td) < 6:
            continue
        shop = list_td[0].text
        province = list_td[1].text
        city = list_td[2].text
        state = list_td[3].text
        address = list_td[4].text
        identification = list_td[5].text
        content.append(province + '\t' + city + '\t' + state + '\t' + shop
                       + '\t' + address + '\t' + identification + '\n')
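
# From the indexing above, each result row is assumed to have six cells:
# <tr><td>shop</td><td>province</td><td>city</td><td>state/district</td>
#     <td>address</td><td>identification</td></tr>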
def getAllShops():
    '''
    Fetch all shop records.
    '''
    # urllib.request.urlopen() does not support authentication, cookies or other
    # advanced HTTP features; those require a custom Opener object created with
    # build_opener(). (The requests calls below handle this themselves, so the
    # opener is effectively unused here.)
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install the opener as the global opener used by urlopen()
    urllib.request.install_opener(opener)
    # If the connection fails, wait 5 seconds and retry indefinitely
    success = False
    while not success:
        try:
            # Request the Sankeshu paint dealer-search page
            r = requests.get("http://www.skshu.com.cn/Agency/result.html?province_id=&city_id=&keyword=",
                             headers=result_headers)
        except requests.exceptions.ConnectionError:
            sleep(5)
        else:
            success = True
    # Parse the response: the first page of dealer-search results
    html = r.text
    bsObj = BeautifulSoup(html, "html.parser")
    # The pagination link exposes the total page count in its data-total-page attribute
    pages = int(bsObj.find(href="/Agency/ajax_lists.html").attrs["data-total-page"])
    list_shop = bsObj.find(id="content-list").find_all("tr")
    getShops(list_shop)
    # Fetch the remaining pages of shops from the AJAX listing endpoint
    for page_index in range(2, pages + 1):
        payload = {'p': page_index}
        print(page_index)
        r = requests.post("http://www.skshu.com.cn/Agency/ajax_lists.html",
                          data=payload, headers=result_headers)
        html_data = r.text
        shop_bsObj = BeautifulSoup(html_data, "html.parser")
        list_shop = shop_bsObj.find_all("tr")
        getShops(list_shop)
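
# Politeness note (assumption, not in the original): adding sleep(1) inside the
# pagination loop above would throttle the back-to-back POST requests.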
def outputResultFile():
    '''
    Write the results to a file.
    '''
    # directory already ends with a separator, so no extra backslash is needed
    path_name = directory + 'fendian_sankeshu.txt'
    with open(path_name, 'w', encoding='utf-8') as file:
        # Write every collected line
        for line in content:
            file.write(line)
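
# An alternative writer (sketch): the stdlib csv module handles delimiters and
# quoting more robustly than hand-joined '\t' strings. outputResultCsv() is a
# hypothetical helper that re-splits the lines getShops() pre-joined.
import csv
def outputResultCsv():
    '''Write the collected rows as a tab-separated file via csv.writer (sketch).'''
    with open(directory + 'fendian_sankeshu.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for line in content:
            writer.writerow(line.rstrip('\n').split('\t'))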
def start_crawl():
    '''
    Main crawler routine.
    '''
    # Fetch all shop records
    getAllShops()
    # Write the result file
    outputResultFile()
    # Done
    print('Complete!')

# Run the crawler
start_crawl()