# -*- coding: utf-8 -*-
# __author__ = 'Administrator'
from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import urllib2
import time
from compiler.ast import flatten
import re
import xlwt

reload(sys)
sys.setdefaultencoding('utf-8')

driver = webdriver.Firefox()
driver.get('https://fresh.jd.com/')
# Click the first category link; the 'div[1]/div/a' index in this XPath has to be filled in by hand
driver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div/a').click()
windows = driver.window_handles
driver.switch_to.window(windows[-1])  # the click opens the goods-list page in a new window
driver.switch_to.window(windows[0])
driver.close()                        # close the original window
driver.switch_to.window(windows[-1])
time.sleep(2)
# Read the total number of goods shown on the list page
pages = driver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text
pages = pages.encode("utf-8")
pages = int(pages)
page = (pages + 59) / 60  # 60 goods per list page, so round up to get the number of pages
all_goods = []
all_url_goods = []  # URLs of every product on every list page
name = []           # SKU names collected from the detail pages
for aa in range(1, page + 1):
    # Build the list-page URL; the 'cat=12218,12221' value has to be filled in by hand for the chosen category
    a = 'https://list.jd.com/list.html?cat=12218,12221&page='
    b = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
    x = '%s%d%s' % (a, aa, b)
    jd = urllib2.urlopen(x)  # fetch one page of the fresh-food goods list
    html = jd.read()
    soup = BeautifulSoup(html, 'html.parser')
    name_divs = soup.select('div[class="p-name"]')
    new_list1 = []
    for i in range(len(name_divs)):
        try:
            # Extract the product URL from each name <div> on the list page
            b = re.findall(r'<a href="(.*?)" target=', str(name_divs[i]))[0]
            new_list1.append(b)  # up to 60 product URLs per list page
        except Exception:
            pass
    print('page %s done' % aa)
    all_url_goods.append(new_list1)
print('URL collection finished, start fetching SKU names')
all_url_goods = flatten(all_url_goods)  # flatten the per-page lists into one list of product URLs
# print(all_url_goods)
# At this point all_url_goods holds the product URLs from every list page
# For every product URL, collect all of its SKU ids, then open each SKU's detail page to read the spec name
for url in all_url_goods:
    xx = '%s%s' % ('http:', url)      # list-page hrefs are protocol-relative ('//item.jd.com/...')
    goods = urllib2.urlopen(xx)       # open the product page
    html = goods.read()
    soup = BeautifulSoup(html, 'lxml')
    list1 = soup.findAll(attrs={'data-sku': True})  # every spec/variant element carries a data-sku attribute
    goods_url = []
    for i in range(len(list1)):
        a = re.findall('data-sku="(.*)" data-value="', str(list1[i]))  # extract the SKU id
        goods_url.append(a)
    goods_url = flatten(goods_url)
    # print(goods_url)
    for i in range(len(goods_url)):
        last_url = 'https://item.jd.com/%d.html' % int(goods_url[i])
        html = urllib2.urlopen(last_url)                  # open the detail page of each SKU
        soup = BeautifulSoup(html, 'lxml')
        last_list = soup.select('div[class="sku-name"]')  # the SKU name lives in <div class="sku-name">
        for j in range(len(last_list)):
            re_goodsname = last_list[j].string
            name.append(re_goodsname)
print(name)
print(len(name))
# all_goods = flatten(all_goods)
work_excel = xlwt.Workbook()
sheet1 = work_excel.add_sheet(u"sheet1", cell_overwrite_ok=True)
for i in range(len(name)):
    sheet1.write(i, 0, name[i])  # one SKU name per row in column 0
    # print i
work_excel.save('xinxianshuiguo.xls')
Pointers from the experts would be much appreciated. One more question: for data that is loaded asynchronously, is there a way to avoid calling time.sleep() directly? Is there another approach, something like Selenium's explicit waits?
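What I have in mind is something like the sketch below, replacing the fixed time.sleep(2) with WebDriverWait plus expected_conditions (assuming the same Firefox driver and the page-count XPath used above) — is this the right direction?

# Sketch only: wait until the page-count span is present instead of sleeping a fixed 2 seconds
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # poll for up to 10 seconds before giving up
span = wait.until(
    EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[7]/div[1]/div[1]/div[1]/div/span')
    )
)
pages = int(span.text)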