Python 抓取淘宝数据
- selenium 模拟登录
- 输入关键字并搜索
- xpath解析
这个案例比较简单，直接上代码。
完整代码如下：
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
# Log in to Taobao with Selenium, search for a keyword and print the title,
# price, deal count and shop of every item on the first result page.


def _clean_text(parts):
    """Join xpath text fragments and drop the newlines and spaces in them."""
    return ''.join(parts).replace('\n', '').replace(' ', '')


# Instantiate the browser; excluding the "enable-automation" switch is meant
# to evade the site's automation detection.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(options=option)
# Let every element lookup poll for up to 10 s, so a slow redirect to the
# login page does not fail the lookup immediately.
bro.implicitly_wait(10)

try:
    # Open the Taobao home page.
    bro.get('https://www.taobao.com/')

    # Locate the search box and type the keyword.
    # find_element(By.<strategy>, value) is used because the
    # find_element_by_* helpers were removed in Selenium 4.3.
    search_box = bro.find_element(By.ID, 'q')
    search_box.click()
    search_box.send_keys('python')

    # Click the search button; the site then redirects to the login page.
    bro.find_element(By.CLASS_NAME, 'search-button').click()
    time.sleep(1)

    # Fill in the account and password fields (replace the placeholders).
    bro.find_element(By.ID, 'fm-login-id').send_keys("XXXXX")
    time.sleep(1)
    bro.find_element(By.ID, 'fm-login-password').send_keys("XXXXX")
    time.sleep(0.5)

    # Click the login button and give the result page time to render.
    bro.find_element(By.CLASS_NAME, 'fm-btn').click()
    time.sleep(3)

    # Parse the rendered page source with xpath.
    page = bro.page_source
    tree = etree.HTML(page)

    # One <div> per product in the result list.
    div_list = tree.xpath('//*[@id="mainsrp-itemlist"]/div//div[@class="items"]/div')

    # Extract the wanted fields from each product node.
    for div in div_list:
        title = _clean_text(div.xpath('.//div[@class="row row-2 title"]/a//text()'))
        price = _clean_text(div.xpath('.//div[@class="price g_price g_price-highlight"]//text()'))
        # Not every item has a deal-count node; indexing [0] unconditionally
        # would raise IndexError and abort the whole run.
        count_nodes = div.xpath('.//div[@class="deal-cnt"]/text()')
        count = count_nodes[0] if count_nodes else ''
        shop = _clean_text(div.xpath('.//div[@class="row row-3 g-clearfix"]//text()'))
        print(title, price, count, shop)

    time.sleep(3)
finally:
    # quit() closes every window and stops the chromedriver process, even when
    # a step above raised; close() would only close the current window.
    bro.quit()
还是要注意：每步操作之后最好睡上几秒，操作太快容易定位不到标签节点。
想要采集多页，只需把上面"解析单页"的那段代码换成下面的 while 循环就可以了（注意循环要放在 bro.close() 之前，浏览器关闭后就无法再翻页了）：
# Scrape several result pages in a row. This snippet needs the logged-in
# `bro` browser session, so it has to run before the browser is closed.
max_pages = 50  # number of result pages to collect
for n in range(max_pages):
    page = bro.page_source
    tree = etree.HTML(page)
    # One <div> per product in the result list.
    div_list = tree.xpath('//*[@id="mainsrp-itemlist"]/div//div[@class="items"]/div')
    for div in div_list:
        title = div.xpath('.//div[@class="row row-2 title"]/a//text()')
        title = ''.join(title).replace('\n', '').replace(' ', '')
        price = div.xpath('.//div[@class="price g_price g_price-highlight"]//text()')
        price = ''.join(price).replace('\n', '').replace(' ', '')
        # Not every item has a deal-count node; indexing [0] unconditionally
        # would raise IndexError and abort the whole crawl.
        count_nodes = div.xpath('.//div[@class="deal-cnt"]/text()')
        count = count_nodes[0] if count_nodes else ''
        shop = div.xpath('.//div[@class="row row-3 g-clearfix"]//text()')
        shop = ''.join(shop).replace('\n', '').replace(' ', '')
        # Only printed here; persist the record at this point if needed.
        print(title, price, count, shop)

    # Locate the "next page" link and click it. find_element(By.XPATH, ...)
    # is used because find_element_by_xpath was removed in Selenium 4.3.
    try:
        next_link = bro.find_element(
            By.XPATH, '//*[@id="mainsrp-pager"]/div/div/div/ul/li[8]/a')
    except NoSuchElementException:
        # No "next page" link was found (e.g. last page reached): stop paging.
        break
    next_link.click()
    # Pause between pages; requesting pages too quickly risks an account ban.
    time.sleep(3)
切记：循环每执行完一次都要让程序睡上几秒钟，请求太快容易被封号！