爬取【京客隆超市】店铺信息

#爬取【京客隆超市】店铺信息

  1. 导入所需库
import requests
import pandas as pd
from lxml import etree
  2. 爬取各区链接
# Step 1: download the shop.aspx landing page and extract the per-district
# link list from the sidebar (div.infoLis).
url = 'http://www.jkl.com.cn/cn/shop.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
response = requests.get(url, headers=headers).text
html = etree.HTML(response)

# District display names, with surrounding whitespace trimmed.
city_name = [label.strip() for label in html.xpath('//div[@class="infoLis"]//a/text()')]

# The hrefs are site-relative; prefix the site root to build absolute URLs.
city_url = ['http://www.jkl.com.cn/cn/' + href for href in html.xpath('//div[@class="infoLis"]//a/@href')]

  3. 当只存在一个大区需要翻页时

for i in city_url:
    # The district list with id=865 is paginated via an ASP.NET postback;
    # every other district fits on a single page.
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        for page_no in range(1, 4):  # pages 1-3 of the paginated list
            # ASP.NET pager postback fields selecting the page number.
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page_no
            }
            response3 = requests.post(url=i, data=form_data, headers=headers).text
            html2 = etree.HTML(response3)
            # Each shop row is four <span> columns: name / address / phone / hours.
            # Strip every column (not just the name) so the CSV has no stray whitespace.
            city_shop_name = [s.strip() for s in html2.xpath('//span[@class="con01"]/text()')]
            city_shop_dis = [s.strip() for s in html2.xpath('//span[@class="con02"]/text()')]
            city_shop_phone = [s.strip() for s in html2.xpath('//span[@class="con03"]/text()')]
            city_shop_time = [s.strip() for s in html2.xpath('//span[@class="con04"]/text()')]
            print(city_shop_name)
            print('*' * 30)
            # Use a dedicated name for the frame instead of shadowing the form dict.
            df = pd.DataFrame({"店铺名称": city_shop_name, "店铺地址": city_shop_dis,
                               "联系方式": city_shop_phone, "营业时间": city_shop_time})
            # Append without header. "ANSI" is not a valid Python codec name
            # (LookupError); gbk is the Chinese-Windows ANSI code page.
            # Filename also fixed: the original had a duplicated "爬取爬取" typo.
            df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0,
                      mode="a", encoding="gbk")
    else:
        response1 = requests.post(url=i, headers=headers).text
        html1 = etree.HTML(response1)
        city_shop_name1 = [s.strip() for s in html1.xpath('//span[@class="con01"]/text()')]
        city_shop_dis1 = [s.strip() for s in html1.xpath('//span[@class="con02"]/text()')]
        city_shop_phone1 = [s.strip() for s in html1.xpath('//span[@class="con03"]/text()')]
        city_shop_time1 = [s.strip() for s in html1.xpath('//span[@class="con04"]/text()')]
        print(city_shop_name1)
        # Store the data. NOTE: this save used to sit at loop level, so it
        # re-ran with stale *1 variables after the paginated branch (and would
        # raise NameError if the paginated district came first); it belongs
        # inside this else-branch only.
        df = pd.DataFrame({"店铺名称": city_shop_name1, "店铺地址": city_shop_dis1,
                           "联系方式": city_shop_phone1, "营业时间": city_shop_time1})
        df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0,
                  mode="a", encoding="gbk")

#完成代码

#爬取【京客隆超市】店铺信息
import requests
import pandas as pd
from lxml import etree
# Download the shop.aspx landing page, then collect each district's
# display name and absolute shop-list URL from the div.infoLis menu.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
response = requests.get(url, headers=headers).text
html = etree.HTML(response)

# Trim whitespace from the anchor texts in a single pass.
city_name = [anchor_text.strip() for anchor_text in html.xpath('//div[@class="infoLis"]//a/text()')]

# Turn the relative hrefs into absolute links under the /cn/ root.
city_url = ['http://www.jkl.com.cn/cn/' + relative for relative in html.xpath('//div[@class="infoLis"]//a/@href')]
for i in city_url:
    # The district list with id=865 is paginated via an ASP.NET postback;
    # every other district fits on a single page.
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        for page_no in range(1, 4):  # pages 1-3 of the paginated list
            # ASP.NET pager postback fields selecting the page number.
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page_no
            }
            response3 = requests.post(url=i, data=form_data, headers=headers).text
            html2 = etree.HTML(response3)
            # Each shop row is four <span> columns: name / address / phone / hours.
            # Strip every column (not just the name) so the CSV has no stray whitespace.
            city_shop_name = [s.strip() for s in html2.xpath('//span[@class="con01"]/text()')]
            city_shop_dis = [s.strip() for s in html2.xpath('//span[@class="con02"]/text()')]
            city_shop_phone = [s.strip() for s in html2.xpath('//span[@class="con03"]/text()')]
            city_shop_time = [s.strip() for s in html2.xpath('//span[@class="con04"]/text()')]
            print(city_shop_name)
            print('*' * 30)
            # Use a dedicated name for the frame instead of shadowing the form dict.
            df = pd.DataFrame({"店铺名称": city_shop_name, "店铺地址": city_shop_dis,
                               "联系方式": city_shop_phone, "营业时间": city_shop_time})
            # Append without header. "ANSI" is not a valid Python codec name
            # (LookupError); gbk is the Chinese-Windows ANSI code page.
            df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0,
                      mode="a", encoding="gbk")
    else:
        response1 = requests.post(url=i, headers=headers).text
        html1 = etree.HTML(response1)
        city_shop_name1 = [s.strip() for s in html1.xpath('//span[@class="con01"]/text()')]
        city_shop_dis1 = [s.strip() for s in html1.xpath('//span[@class="con02"]/text()')]
        city_shop_phone1 = [s.strip() for s in html1.xpath('//span[@class="con03"]/text()')]
        city_shop_time1 = [s.strip() for s in html1.xpath('//span[@class="con04"]/text()')]
        print(city_shop_name1)
        # Store the data. NOTE: this save used to sit at loop level, so it
        # re-ran with stale *1 variables after the paginated branch (and would
        # raise NameError if the paginated district came first); it belongs
        # inside this else-branch only.
        df = pd.DataFrame({"店铺名称": city_shop_name1, "店铺地址": city_shop_dis1,
                           "联系方式": city_shop_phone1, "营业时间": city_shop_time1})
        df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0,
                  mode="a", encoding="gbk")
#区域内店铺不止一页时需按页码翻页爬取,只有一页时直接请求即可



        
    
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值