A crawler that scrapes some needed data from JD.com (京东)

# -*- coding: utf-8 -*-
# __author__ = 'Administrator'


# Python 2 script: urllib2, compiler.ast and sys.setdefaultencoding do not exist in Python 3
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import urllib2
import time
from compiler.ast import flatten
import re
import xlwt
reload(sys)
sys.setdefaultencoding('utf-8')


driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
driver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div/a').click()  # the trailing div[1]/div/a (the first div index) has to be filled in manually
windows = driver.window_handles
driver.switch_to.window(windows[-1])  # the click opens the product-list page in a new window
driver.switch_to.window(windows[0])
driver.close()  # close the original window
driver.switch_to.window(windows[-1])
time.sleep(2)
pages = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text  # total number of products, used to work out how many list pages there are
pages = int(pages.encode("utf-8"))
page = pages / 60 + 1  # 60 products per list page
all_goods = []
all_url_goods = []  # URLs of every product on the list pages
name = []  # product/spec names collected from the detail pages
for aa in range(1, page + 1):
    a = 'https://list.jd.com/list.html?cat=12218,12221&page='  # ***** the cat=12218,12221 value has to be filled in manually
    b = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
    x = '%s%d%s' % (a, aa, b)
    jd = urllib2.urlopen(x)  # open one page of the fresh-food product list
    html = jd.read()
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select('div[class="p-name"]')
    new_list1 = []
    for i in range(len(items)):
        try:
            href = re.findall('">\n<a href="(.*)" target=', str(items[i]))[0].decode()  # match the product URL on the list page
            new_list1.append(href)  # collect the (up to 60) product URLs of this list page
        except Exception:
            pass
    print('page %s' % aa)
    all_url_goods.append(new_list1)
print('Finished collecting URLs, now fetching the spec names')
all_url_goods = flatten(all_url_goods)
# print(all_url_goods)
# All list pages have been crawled and every product URL is now in all_url_goods

for url in all_url_goods:
    url = url.replace('"', '')  # strip any stray quotes from the captured URL
    xx = 'http:%s' % url
    goods = urllib2.urlopen(xx)  # open the page of each product found on the list pages
    html = goods.read()
    soup = BeautifulSoup(html, 'lxml')
    list1 = soup.findAll(attrs={'data-sku': True})
    goods_url = []
    for j in range(len(list1)):
        a = re.findall('data-sku="(.*)" data-value="', str(list1[j]))  # match the SKU ids
        goods_url.append(a)
    goods_url = flatten(goods_url)
    # print(goods_url)
    for j in range(len(goods_url)):
        sku = int(goods_url[j])
        last_url = 'https://item.jd.com/%d.html' % sku
        html = urllib2.urlopen(last_url)  # open the detail page of each spec of the product
        soup = BeautifulSoup(html, 'lxml')
        last_list = soup.select('div[class="sku-name"]')  # match the product name
        for k in range(len(last_list)):
            re_goodsname = last_list[k].string
            name.append(re_goodsname)
print(name)
print(len(name))

# all_goods = flatten(all_goods)
work_excel = xlwt.Workbook()
sheet1 = work_excel.add_sheet(u"sheet1", cell_overwrite_ok=True)
for i in range(len(name)):
    sheet1.write(i, 0, name[i])  # one product name per row
    # print i
work_excel.save('xinxianshuiguo.xls')
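
Side note: the regex extraction of the href and data-sku values above can also be done through BeautifulSoup's own attribute access, which tends to be less brittle when the page markup changes. A minimal sketch, assuming the same soup objects as in the script (the variable names are just placeholders):

# Product URLs from one list page, without the regex
for div in soup.select('div.p-name'):
    a_tag = div.find('a')
    if a_tag is not None and a_tag.get('href'):
        new_list1.append(a_tag['href'])   # e.g. //item.jd.com/123456.html

# SKU ids from one detail page, without the regex
for tag in soup.findAll(attrs={'data-sku': True}):
    goods_url.append(tag['data-sku'])     # SKU id as a string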

Pointers from more experienced people are welcome. One more question: for data that is loaded asynchronously, is there a way to avoid calling time.sleep() directly? Is there another approach, something like Selenium's explicit waits?
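
A minimal sketch of that explicit-wait idea, using Selenium's WebDriverWait and expected_conditions instead of a fixed sleep; the CSS selector '.gl-item' is only an assumed placeholder for whatever element the asynchronously loaded content exposes:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)  # poll the DOM for up to 10 seconds
# Blocks until the element is present (or raises TimeoutException) instead of sleeping a fixed time
first_item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.gl-item')))

Dropped in place of the time.sleep(2) after the window switch, this waits only as long as the page actually needs.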
