# 使用 selenium3 + bs4
# 思路: 淘宝网址,如果通过 selenium 去访问,就会要求我们登录;我们可以让程序睡一会,
# 自己手动扫码登录,就可以获取到商品的网页数据了,然后通过 bs4 解析获得商品数据。
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
class SpiderProduct(object):
    """Scrape Taobao search results with Selenium 3 and parse them with bs4.

    Workflow: open the Taobao home page, type the search keyword, then pause
    so the user can scan the QR code and log in manually (Taobao blocks
    automated sessions otherwise). After login, page through the result list
    up to ``end_page`` and append each page's products to ``fp``.
    """

    def __init__(self, url, obj, fp, end_page, driver):
        # fp: an open, writable text file handle for the scraped records.
        self.fp = fp
        # Last result page (inclusive) to scrape.
        self.end_page = end_page
        self.start_page = 1
        self.url = url
        # obj: the search keyword typed into the search box.
        self.obj = obj
        self.driver = driver

    def start(self):
        """Entry point: open the site, submit the search, wait for manual login."""
        self.driver.get(self.url)
        self.driver.implicitly_wait(10)
        self.driver.find_element_by_id('q').send_keys(self.obj)
        self.driver.find_element_by_class_name('btn-search').click()
        # Taobao demands a login here -- scan the QR code manually within 10s.
        time.sleep(10)
        self.get_html()

    def get_html(self):
        """Scrape result pages from start_page through end_page."""
        # Iterative pagination: the original mutually-recursive version
        # (get_html -> get_to_next_page -> get_html) added one stack frame
        # per page; a loop has constant depth for any end_page.
        while True:
            self.get_product_detail(self.driver.page_source)
            self.start_page += 1
            if self.start_page > self.end_page:
                return
            self.get_to_next_page()

    def get_to_next_page(self):
        """Advance the result list by one page."""
        next_btn = self.driver.find_element_by_class_name('next')
        # The "next" element has no native click handler; trigger it via JS.
        self.driver.execute_script("arguments[0].click()", next_btn)
        # Give the next page time to render before it is read.
        time.sleep(5)

    def get_product_detail(self, page_data):
        """Parse one result page (HTML string) and persist its products."""
        soup = BeautifulSoup(page_data, 'lxml')
        # Each J_MouserOnverReq node is one product card.
        one_page_products = soup.find_all(class_='J_MouserOnverReq')
        product = []
        for one_product in one_page_products:
            try:
                productImg = one_product.find(class_="J_ItemPic").get('src')
                # [1:] drops the leading currency symbol.
                productPrice = one_product.find(class_="g_price").get_text().strip()[1:]
                productTitle = one_product.find(class_="row-2").get_text().strip()
                productShop = one_product.find(class_="shopname").get_text().strip()
                # [:-3] drops the trailing "人付款" suffix.
                productStatus = one_product.find(class_="deal-cnt").get_text().strip()[:-3]
                productLocation = one_product.find(class_='row-3').find(class_='location').get_text().strip()
            except AttributeError:
                # A card missing an expected sub-element (ad slot, layout
                # change) made find() return None -- skip it rather than
                # crash the whole run.
                continue
            print(productPrice)
            print(productTitle)
            print(productShop)
            print(productStatus)
            print(productLocation)
            time.sleep(0.5)
            product_one = {
                '图片': productImg,
                '价格': productPrice,
                '标题': productTitle,
                '店铺': productShop,
                '销量': productStatus,
                '地点': productLocation,
            }
            product.append(product_one)
        self.save_to_file(product)

    def save_to_file(self, product):
        """Append one page's product list to the output file, one JSON line."""
        print(len(product))
        # json.dumps (not str()) so taobao.json holds valid, re-loadable
        # JSON; ensure_ascii=False keeps the Chinese keys/values readable.
        self.fp.write(json.dumps(product, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # url = 'https://www.tmall.com/'
    url = 'https://www.taobao.com/'
    obj = '手机'    # search keyword typed into the Taobao search box
    end_page = 10   # number of result pages to scrape
    driver = webdriver.Chrome()
    try:
        # Context manager guarantees the output file is closed even if the
        # scrape raises part-way through.
        with open('taobao.json', 'a', encoding='utf-8') as fp:
            sp = SpiderProduct(url, obj, fp, end_page, driver)
            sp.start()
    finally:
        # quit() shuts down the whole browser session and the chromedriver
        # process; close() would only close the current window and leak the
        # driver process.
        driver.quit()