以下代码用于从电商网站上自动获取书籍信息:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
# Entry page for the crawl: the JD home page (the URL carries utm_* query parameters).
base_url = "https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_e98375277f714ddabd4e98712adec3a9"

# Placeholder written to the output when a field cannot be found in a result item.
UNKNOWN = '不详'
# Seconds to pause after each scroll step so lazily loaded content has time to render.
SCROLL_PAUSE = 0.5
# Number of 1000-pixel scroll steps performed on every result page.
SCROLL_STEPS = 15


def _first_text(node, selector, default=UNKNOWN):
    """Return the text of the first element under *node* matching *selector*.

    Falls back to *default* when nothing matches, so one incomplete result
    item cannot abort the whole crawl with an IndexError.
    """
    matches = node.select(selector)
    return matches[0].getText() if matches else default


def _image_url(node):
    """Return the cover image URL of a result item, or UNKNOWN if unavailable.

    The `src` attribute is preferred; `data-lazy-img` is used as a fallback
    when `src` is absent (presumably for images that have not been
    lazy-loaded yet -- confirm against the live markup).
    """
    images = node.select('.p-img > a > img')
    if not images:
        return UNKNOWN
    return images[0].get('src') or images[0].get('data-lazy-img') or UNKNOWN


# Search the site for '大数据' and dump one tab-separated line per result item
# (name, shop/press, image URL, price, comment count) into data1.txt,
# following the "next page" link until it is missing or disabled.
with open('data1.txt', 'w', encoding='utf-8') as f:
    n = 1  # running 1-based counter of processed items, printed as progress
    driver = webdriver.Chrome(executable_path='d:\\chromedriver.exe')
    try:
        # implicitly_wait() only configures the element-lookup timeout; it does
        # not pause execution, so it is set once here instead of per scroll step.
        driver.implicitly_wait(10)
        driver.get(base_url)
        driver.find_element_by_id('key').send_keys('大数据')
        time.sleep(3)
        driver.find_element_by_class_name('button').click()

        while True:
            # Scroll down in 1000px increments (1000 .. 15000), pausing after
            # each step so content loaded on scroll can appear in page_source.
            for step in range(1, SCROLL_STEPS + 1):
                driver.execute_script(
                    'var q = document.documentElement.scrollTop={}'.format(step * 1000)
                )
                time.sleep(SCROLL_PAUSE)

            doc = BeautifulSoup(driver.page_source, 'html.parser')
            for book in doc.select('.gl-warp > li'):
                print(n)
                book_name = _first_text(book, '.p-name > a >em')
                book_press = _first_text(book, '.p-shopnum >a')
                book_imge = _image_url(book)
                book_price = _first_text(book, '.p-price > strong')
                book_com_num = _first_text(book, '.p-commit > strong')
                f.write('\t'.join(
                    [book_name, book_press, book_imge, book_price, book_com_num]
                ) + '\n')
                n += 1

            # Stop when there is no next-page link or it carries the `disabled`
            # class. The original compared against the selector 'pn-next'
            # (no leading dot), which matches no element, so the last page was
            # never detected.
            # NOTE(review): assumes the site marks the last page with
            # `.pn-next.disabled` -- confirm against the live markup.
            if not doc.select('.pn-next') or doc.select('.pn-next.disabled'):
                break
            driver.find_element_by_class_name('pn-next').click()
            time.sleep(2)
    finally:
        # Always shut the browser down, even if scraping fails midway.
        driver.quit()