# Requirement: the target site lazy-loads results — without scrolling the mouse
# only half a page renders, so the script must scroll down to load a full page.
# Step 1: open the home page, run the search, then scrape page 1's data.
# Module-level flag: True until the CSV header row has been written (func1
# writes the header exactly once, then flips this to False).
is_first=True
def func1():
    """Scrape JD search results for '手机' into files/京东手机.csv.

    Opens the JD home page, submits the search, scrolls page 1 into full
    render, parses price + shop name for every listing, appends the rows
    (with a one-time header) to the CSV, then calls fun2() in a loop to
    walk the remaining result pages.

    Relies on module-level globals: ``b`` (Selenium WebDriver), ``is_first``
    (header-written flag), and ``fun2`` (per-page scraper).
    """
    b.get('https://www.jd.com/')
    time.sleep(2)
    # Locate the search box, type the query, and submit with Enter.
    search_button = b.find_element_by_id('key')
    search_button.send_keys('手机')
    search_button.send_keys(Keys.ENTER)
    time.sleep(2)
    # JD lazy-loads the lower half of the result page; scroll down in steps
    # so every listing is rendered before we grab page_source.
    height = 800
    for _ in range(12):
        b.execute_script(f'window.scrollTo(0,{height})')
        height += 800
        time.sleep(2)
    # Parse the fully rendered page.
    soup1 = BeautifulSoup(b.page_source, 'lxml')
    lis = soup1.select('.gl-warp.clearfix>li')
    list1 = []
    for li in lis:
        price = li.select_one('div.p-price > strong > i').text
        try:
            shop_name = li.select_one('a.curr-shop.hd-shopname').text
        except AttributeError:
            # select_one returned None (no shop-name node on this listing);
            # catch only that, not every exception, so real bugs still surface.
            shop_name = ''
        list1.append([price, shop_name])
    # Append page-1 rows; the with-block guarantees the handle is closed
    # and the buffer flushed (the original leaked the open file object).
    global is_first
    with open('files/京东手机.csv', 'a', encoding='utf-8', newline='') as f:
        writer1 = csv.writer(f)
        if is_first:
            writer1.writerow(['价格', '店铺名'])
            is_first = False
        writer1.writerows(list1)
    time.sleep(2)
    print('================下载成功======================')
    # Step 3: repeat step 2 for the remaining 49 result pages
    # (idiomatic bounded loop instead of a manual while-counter).
    for _ in range(49):
        fun2()
# Step 2: click "next page", scroll the new page into full render, and append
# its rows to the CSV. Repeated once per remaining result page.
def fun2():
    """Scrape one additional JD result page and append its rows to the CSV.

    Clicks the "next page" button, scrolls so lazy-loaded listings render,
    parses price + shop name per listing, and appends the rows (no header —
    func1 already wrote it) to files/京东手机.csv.

    Relies on the module-level Selenium WebDriver global ``b``.
    """
    # Advance to the next result page.
    next_btn = b.find_element_by_class_name('pn-next')
    next_btn.click()
    # Scroll in steps so listings below the fold are rendered before parsing.
    height = 800
    for _ in range(12):
        b.execute_script(f'window.scrollTo(0,{height})')
        height += 800
        time.sleep(2)
    # Parse the rendered page.
    soup1 = BeautifulSoup(b.page_source, 'lxml')
    lis = soup1.select('.gl-warp.clearfix>li')
    list1 = []
    for li in lis:
        price = li.select_one('div.p-price > strong > i').text
        try:
            shop_name = li.select_one('a.curr-shop.hd-shopname').text
        except AttributeError:
            # select_one returned None (listing has no shop-name node);
            # narrow catch so unrelated errors are not silently swallowed.
            shop_name = ''
        list1.append([price, shop_name])
    # Pages after the first carry no header row — just append the data.
    # The with-block closes the handle (the original leaked it).
    with open('files/京东手机.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(list1)
    time.sleep(2)
    print('================下载成功======================')
# Step 3: to scrape N pages, the step-2 function is called in a loop placed at
# the end of step 1 — see the tail of func1.
# Step 4: run func1, which scrapes page 1 and then loops fun2 for the rest.