# 本脚本使用 selenium 库驱动火狐(Firefox)浏览器抓取数据,并把结果保存到 csv 表格中。
# 环境搭建不再详述,如运行条件不满足请自行百度安装;相比上一版本略有改进:可以一次输入多个搜索词。
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 11 20:21:04 2019
@author: Administrator
"""
import codecs
import csv
import random
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Search keywords; the main loop runs one complete crawl per entry.
li_list = ["洗手液","袜子","卫生纸"]# format: a list of keyword strings, as shown here
# Address opened before each search. Per the original author, the crawl is
# written for the result page that appears after searching for a concrete
# product; searching an industry/category term on Suning produces a
# differently structured page.
wangzhi = "https://search.suning.com/"
# Number of result pages to walk through per keyword (default: 50).
yema = 50
# Running counter that only appears in the progress output; it has no other use.
num = 1
# codecs.open with an explicit encoding keeps Chinese text from being garbled
# when written. Mode 'a' appends, so re-running the script adds rows to an
# existing file instead of replacing it.
f = codecs.open('123suning.csv','a',encoding='utf-8')
csv_writer = csv.writer(f)
# Chrome alternative (disabled).
# NOTE(review): the module path in the disabled line below looks misspelled
# ("chrom.webdirver") — fix it before re-enabling.
#browser = webdriver.chrom.webdirver.WebDriver(executable_path="chromedriver")
# Firefox: a single driver instance, created at import time and shared by
# browser_1 and browser_2 through their default argument.
# NOTE(review): `executable_path` is the older Selenium keyword; newer Selenium
# releases may require a Service object instead — confirm against the
# installed version.
browser1 = webdriver.Firefox(executable_path="geckodriver")
def browser_1(url, browser=browser1):
    """Open ``url`` in ``browser`` and hand the same driver back.

    Used by the main loop to load the search page before a keyword is typed.

    Args:
        url: Address to navigate to.
        browser: WebDriver instance to drive. Defaults to the module-level
            ``browser1`` (the default is bound when this function is defined).

    Returns:
        The ``browser`` object that was passed in, after navigation.
    """
    browser.get(url)
    # Optional login step (disabled). To crawl as a logged-in user, fill
    # cookie_list with the cookies exported after signing in and enable the
    # loop below. Note these are NOT the same cookies browser_2 needs: this
    # function registers them for the search domain.
    #
    # cookie_list = [{...}]
    # for item in cookie_list:
    #     browser.add_cookie({
    #         'domain': 'search.suning.com',
    #         'name': item['name'],
    #         'value': item['value'],
    #         'path': '/',
    #         'expires': None
    #     })
    return browser
def browser_2(url, browser=browser1):
    """Open ``url`` in ``browser`` and hand the same driver back.

    Used by the main loop to load an individual product page before the
    shop's contact fields are read.

    Args:
        url: Address to navigate to.
        browser: WebDriver instance to drive. Defaults to the module-level
            ``browser1`` (the default is bound when this function is defined).

    Returns:
        The ``browser`` object that was passed in, after navigation.
    """
    browser.get(url)
    # Optional login step (disabled). To crawl as a logged-in user, fill
    # cookie_list with the cookies exported after signing in and enable the
    # loop below. Note these are NOT the same cookies browser_1 needs: this
    # function registers them for the product domain.
    #
    # cookie_list = [{...}]
    # for item in cookie_list:
    #     browser.add_cookie({
    #         'domain': 'product.suning.com',
    #         'name': item['name'],
    #         'value': item['value'],
    #         'path': '/',
    #         'expires': None
    #     })
    return browser
# Product links are recognised by this prefix; the characters right after it
# are what the script treats as the shop id.
_PRODUCT_PREFIX = "https://product.suning.com/"
_SHOP_ID_LENGTH = 10
# Links containing this marker are skipped (the original script called the
# marker `vip`; presumably it identifies the platform's own store — confirm).
_SKIPPED_SHOP_MARK = "/0000000000/"


def _shop_id(url):
    """Return the shop id embedded in a product ``url``, or None.

    None is returned when ``url`` is empty/None (an element without an href)
    or does not contain the product prefix. The id is taken relative to where
    the prefix actually occurs rather than at a fixed string offset.
    """
    if not url or _PRODUCT_PREFIX not in url:
        return None
    return url.partition(_PRODUCT_PREFIX)[2][:_SHOP_ID_LENGTH]


def _collect_product_urls(browser, page_limit):
    """Walk up to ``page_limit`` result pages and return one product URL per shop.

    Args:
        browser: WebDriver currently showing the first page of search results.
        page_limit: Maximum number of result pages to visit.

    Returns:
        List of product URLs, keeping only the first link seen for each shop id.
    """
    urls = []
    seen_shop_ids = set()  # set: O(1) duplicate check
    nub = 1  # running count of accepted links, shown in the progress output
    for page in range(page_limit):
        print(page)
        # Scroll to the bottom of the page before reading the result links
        # (presumably so lazily loaded items get rendered — confirm).
        browser.execute_script("var q=document.documentElement.scrollTop=100000")
        time.sleep(random.randint(3, 5))
        for shop in browser.find_elements(By.CLASS_NAME, "sellPoint"):
            url = shop.get_attribute('href')
            shopid = _shop_id(url)
            if shopid is None:
                continue
            # Keep a single link per shop and drop the excluded store.
            if _SKIPPED_SHOP_MARK in url or shopid in seen_shop_ids:
                continue
            seen_shop_ids.add(shopid)
            urls.append(url)
            print(page, "--", nub, "--", url)
            print(shopid)
            nub += 1
        print(page, "页")
        # Scroll back towards the top before paging, as the original did.
        browser.execute_script("var q=document.documentElement.scrollTop=500")
        time.sleep(random.randint(2, 3))
        if page + 1 == page_limit:
            break
        try:
            next_page = browser.find_element(By.CLASS_NAME, "next")
            time.sleep(random.randint(2, 3))
            next_page.click()
        except WebDriverException:
            # No usable "next" control: the results ran out before the page
            # limit, so stop paging and keep what has been collected.
            break
        time.sleep(random.randint(2, 5))
    return urls


def _read_shop_contact(url):
    """Load a product page and return ``[company, phone, address]``, or None.

    Empty fields are replaced by the string "null". None is returned when the
    page cannot be loaded or any expected element is missing.
    """
    try:
        page = browser_2(url)
        time.sleep(2)
        name_el = page.find_element(By.ID, "chead_companyName")
        phone_el = page.find_element(By.ID, "chead_telPhone")
        address_el = page.find_element(By.ID, "chead_companyAddress")
        # The texts are read only after this click, matching the original
        # order (presumably the click reveals the contact panel — confirm).
        page.find_element(By.ID, "chead_road").click()
        return [el.text or "null" for el in (name_el, phone_el, address_el)]
    except WebDriverException:
        return None


# Main flow: for every keyword, run the search, gather one product link per
# shop, then visit each link and append the shop's contact details to the CSV.
try:
    for li in li_list:
        print(li)
        browser = browser_1(wangzhi)
        input_txt = browser.find_element(By.ID, "sTxt")
        input_txt.send_keys(li)
        browser.find_element(By.ID, "sBtn").click()
        time.sleep(random.randint(2, 5))
        try:
            # Best effort: dismiss the overlay if one is shown.
            browser.find_element(By.CLASS_NAME, "close-btn").click()
        except WebDriverException:
            print(li)
        urls = _collect_product_urls(browser, yema)
        print("---" * 10)
        for ul in urls:
            row = _read_shop_contact(ul)
            if row is None:
                print("错误页面")
                continue
            num += 1
            companyName, telPhone, companyAddress = row
            print(companyName, "==", telPhone, "==", companyAddress, "/", li, "/", num)
            csv_writer.writerow(row)
finally:
    # Always flush/close the CSV and end the browser session, even when the
    # crawl is interrupted or fails part-way through. The file is closed
    # first so collected rows are saved even if quitting the driver fails.
    f.close()
    browser1.quit()
print("结束")
# 仅供交流学习!