- 基本原理:
- 主要利用 Selenium 提取当前页面的商品信息,然后不断点击“下一页”继续提取
- 代码如下:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def spider(url, keyword):
    """Open *url* in Chrome, search for *keyword*, and print the results.

    The keyword is typed into the element whose id is ``key`` and submitted
    with ENTER; the resulting pages are then handed to ``get_goods``.

    Args:
        url: Address of the page that contains the search box.
        keyword: Text to type into the search box.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # The find_element_by_* helpers were removed in Selenium 4; the
        # By-based locator API works on both Selenium 3 and 4.
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        # Fixed pause to give the result list time to render.
        time.sleep(5)
        get_goods(driver)
    finally:
        # Always shut the browser down, even if scraping raised, so no
        # Chrome / chromedriver process is left behind.
        driver.quit()
def get_goods(driver):
    """Print every product on the current result page and on following pages.

    For each element with class ``gl-item`` the link, name, price and comment
    count are printed. Afterwards the link whose text contains "下一页"
    ("next page") is clicked and the next page is processed the same way.
    Scraping stops when no such link is present.

    Args:
        driver: A Selenium WebDriver already showing a search result page.
    """
    template = '\n商品:%s\n链接:%s\n价格:%s\n评论:%s\n'

    # Iterate instead of recursing once per page: a long result set would
    # otherwise exhaust Python's recursion limit.
    while True:
        for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
            link = good.find_element(By.TAG_NAME, 'a').get_attribute('href')
            # These are descendant selectors ("<class> <tag>"), not single
            # class names, so they must be looked up as CSS selectors.
            name = good.find_element(
                By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
            price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
            commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text
            print(template % (name, link, price, commit))

        # find_elements returns an empty list rather than raising when the
        # link is missing, which gives a clean stop condition.
        # NOTE(review): if the site keeps a disabled "下一页" link on the last
        # page, an extra check on that link's state is needed - confirm.
        next_links = driver.find_elements(By.PARTIAL_LINK_TEXT, '下一页')
        if not next_links:
            return
        next_links[0].click()
        # Fixed pause to let the next page load before reading it.
        time.sleep(2)
# Script entry: search the JD home page for "口罩" (face masks) and print the results.
spider(url='https://www.jd.com/', keyword='口罩')