import re
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Module-level browser session and explicit-wait helper shared by the
# scraping functions in this file. The wait gives up after 10 seconds.
driver = webdriver.Chrome()
wait = WebDriverWait(driver, timeout=10)
def search(retries=3):
    """Open Taobao, search for the keyword '美食', and return the pager text.

    Loads the Taobao home page, types the hard-coded keyword into the
    ``#q`` search box, clicks the search button and waits for the element
    with class ``total`` to appear on the result page.

    Args:
        retries: How many additional attempts to make after a timed-out
            attempt. Negative values are treated as 0.

    Returns:
        The ``.text`` of the ``total`` element (the caller extracts the
        page count from it).

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out waiting for one of the elements.
    """
    last_attempt = max(retries, 0)
    for attempt in range(last_attempt + 1):
        try:
            # Reload the home page on every attempt so a retry starts clean.
            driver.get("https://www.taobao.com/")
            intext = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                )
            )
            intext.send_keys('美食')
            submit.click()
            page_num = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "total"))
            )
            return page_num.text
        except TimeoutException:
            # WebDriverWait.until raises selenium's TimeoutException, not the
            # builtin TimeoutError, so that is what must be caught here.
            # Retry a bounded number of times instead of recursing forever.
            if attempt == last_attempt:
                raise
def next_page(num, retries=3):
    """Jump the result list to page ``num`` and return its products.

    Types ``num`` into the pager's page-number input, clicks the pager's
    submit button, and waits until the active pager item shows ``num``.

    Args:
        num: Page number to navigate to.
        retries: How many additional attempts to make after a timed-out
            attempt. Negative values are treated as 0.

    Returns:
        The generator produced by ``get_products()``. It is lazy: the page
        is parsed only when the caller iterates it.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt
            timed out waiting for the pager.
    """
    last_attempt = max(retries, 0)
    for attempt in range(last_attempt + 1):
        try:
            page_text = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_text.clear()
            page_text.send_keys(num)
            # Fixed pause before clicking; presumably gives the page time to
            # register the typed value -- TODO confirm it is still needed.
            sleep(2)
            submit.click()
            # The navigation is considered done once the highlighted pager
            # item contains the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(num),
                )
            )
            return get_products()
        except TimeoutException:
            # WebDriverWait.until raises selenium's TimeoutException, not the
            # builtin TimeoutError. Retrying inside the loop also guarantees
            # the retried result is returned rather than silently dropped.
            if attempt == last_attempt:
                raise
def get_products():
    """Yield one ``{'title', 'price'}`` dict per product card on the page.

    Waits for at least one element matching
    ``#mainsrp-itemlist .items .item`` to be present, then parses the
    current page source with BeautifulSoup.

    Yields:
        dict: ``'title'`` is the ``alt`` attribute of the card's first
        ``<img>`` (empty string if the attribute is absent) and ``'price'``
        is the text of the card's first ``<strong>``.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product card
            appears within the wait timeout.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Match on the individual CSS classes. Comparing against the literal
    # string 'item J_MouserOnverReq ' (with a trailing space) depends on how
    # a given BeautifulSoup version tokenizes the class attribute and can
    # silently match nothing.
    for item in soup.select('div.item.J_MouserOnverReq'):
        image = item.img
        price = item.strong
        # Skip cards that lack an image or a price element instead of
        # failing the whole page with an AttributeError.
        if image is None or price is None:
            continue
        yield {'title': image.get('alt', ''), 'price': price.text}
def main():
    """Run the scraper: search, then print every product title on every page.

    Calls ``search()`` to obtain the pager text, extracts the first run of
    digits from it as the total page count, prints the titles found on the
    first result page, then visits pages 2..total with ``next_page()`` and
    prints their titles too. The browser is closed when the run finishes
    or fails.

    Raises:
        ValueError: If the pager text returned by ``search()`` contains no
            digits.
    """
    try:
        total_text = search()
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        match = re.search(r'\d+', total_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: %r" % (total_text,)
            )
        total = int(match.group())

        # search() leaves the browser on page 1, so scrape it before paging;
        # otherwise the first page's products are never reported.
        for product in get_products():
            print(product['title'])

        for page in range(2, total + 1):
            # Tolerate a None result so a failed page does not crash the loop.
            for product in next_page(page) or ():
                print(product['title'])
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()
# Call main() only when this file is executed directly as a script.
if __name__ == "__main__":
    main()
# Source article title: "Python: scraping Taobao products with Selenium"
# (Blog page footer: latest recommended article published 2024-08-09 10:04:21)