# 项目简介:
# spider_taobao
# 以下是部分代码,博主没有对数据进行处理。
import time
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
from config import *
from urllib.parse import quote
# Module-level browser setup shared by every function in this script.
chrome_options = webdriver.ChromeOptions()
# Uncomment the next line to run Chrome without a visible window.
# chrome_options.add_argument('--headless')
# Exclude the "enable-automation" switch from Chrome's launch switches
# (presumably so the session looks less like an automated one).
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Pass the options by keyword: in older Selenium releases the first positional
# parameter of Chrome() is the driver executable path, not the options object,
# so a positional call would silently misconfigure the driver there.
browser = webdriver.Chrome(options=chrome_options)
# Explicit-wait helper; every wait.until(...) call gives up after 30 seconds.
wait = WebDriverWait(browser, 30)
def login():
    """Log in to Taobao through the password login form.

    Opens the login page, types the account name and password, makes a
    best-effort attempt to drag the slider captcha when one is displayed,
    and finally clicks the login button.

    :return: None
    :raises TimeoutException: if the login fields or the login button do not
        become available within the shared wait timeout.
    """
    # Imported locally so this function does not depend on edits to the
    # file-level import block.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    url = "https://login.taobao.com/member/login.jhtml?redirectURL=https%3a%2f%2fs.taobao.com:443/search%2F_____tmd_____%2Fpage%2Flogin_jump%3Frand%3DS3WxGHAgAt756EpznwfNzJq2AFA2qBNla3j6EINUS8We9dazM_iKElp8DwVSHZUevpC41Bx7RzivXIj9RnZgdg%26_lgt_%3D56187c2d2a96aec483e3f50a4baeeaa1___215918___bad338c181575046793221abbcea4855___eaebc79cac1eb5d2f7d8b4595e00ec73344a42d5a0b8cf56539c823cd24ac06c6b213ffd502da27c5771922daeb449eaeab84d3310934d804dd027a8d75c0275cdb3d5eb3b5d25381a7341f118cdf5120eb265e1c82cd48343995861cd625f2cad31a31f10b4650a1ef2ca0e10e585fa7e062c8e4c21cea1d0aa21ff1c6f4bd9254facd14bb5207b2f4873ebc10a73c154ca108f5af14608caea993432e6050d53b8568a3b95049b3641155db964afbdbd1ca290b8dcdb475a166232a82573f8f74e9970c1432e542a05d1f12eed775d&uuid=56187c2d2a96aec483e3f50a4baeeaa1"
    browser.get(url)
    login_id = wait.until(
        EC.presence_of_element_located((By.ID, "fm-login-id"))
    )
    login_password = wait.until(
        EC.presence_of_element_located((By.ID, "fm-login-password"))
    )
    # Taobao account name and password.
    # NOTE(review): 账号 / 密码 are not defined in this block; presumably they
    # are supplied by `config` -- confirm before running.
    login_id.send_keys(账号)
    login_password.send_keys(密码)
    time.sleep(2)

    # Check whether a slider captcha appeared. find_element(By.XPATH, ...) is
    # used because the find_element_by_* helpers no longer exist in current
    # Selenium releases; with the old call the resulting AttributeError was
    # swallowed and the captcha was never handled.
    try:
        slider = browser.find_element(
            By.XPATH, "//span[contains(@class, 'nc_iconfont btn_slide')]"
        )
    except NoSuchElementException:
        # No captcha on the page: nothing to drag.
        slider = None

    if slider is not None:
        try:
            if slider.is_displayed():
                # Press the handle, drag it 258 px to the right, then release
                # after a short pause.
                ActionChains(browser).click_and_hold(on_element=slider).perform()
                ActionChains(browser).move_by_offset(xoffset=258, yoffset=0).perform()
                ActionChains(browser).pause(0.5).release().perform()
        except WebDriverException as exc:
            # The drag stays best-effort: report the failure and still try to
            # submit the form instead of aborting the login.
            print(f"滑动验证码拖动失败: {exc}")

    # Click the login button once it is actually clickable, not merely present
    # in the DOM.
    button = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'password-login'))
    )
    button.click()
# 爬取每一页的数据
def index_page(page, retries=3):
    """Load one page of search results for KEYWORD and extract its products.

    The search URL is always opened first; for any page after the first, the
    page number is typed into the pagination jump box and submitted.

    :param page: 1-based number of the result page to crawl.
    :param retries: how many additional attempts are made after a wait times
        out. When all attempts time out the page is skipped with a message
        instead of retrying without limit.
    :return: None
    """
    for _attempt in range(retries + 1):
        print(f"正在爬取第{page}页")
        try:
            url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
            print(quote(KEYWORD))
            browser.get(url)
            if page > 1:
                # Wait until the page-number input box exists.
                page_input = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'div > span > input')
                    )
                )
                # The jump button must be visible and enabled before clicking.
                submit_button = wait.until(
                    EC.element_to_be_clickable(
                        (By.CLASS_NAME, 'next-pagination-jump-go')
                    )
                )
                # Clear the box, type the target page number, and jump.
                page_input.clear()
                page_input.send_keys(page)
                submit_button.click()
            # Wait for the current-page marker and the product cards to exist.
            # NOTE(review): this only checks presence; it does not verify that
            # the marker shows `page` -- confirm whether a text check is needed.
            wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'next-current'))
            )
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'Card--doubleCardWrapper--L2XFE73')
                )
            )
            get_products()
            return
        except TimeoutException:
            # Fall through to the next attempt, if any remain.
            continue
    print(f"第{page}页多次超时,已跳过")
def get_products():
    """Extract the products from the page currently loaded in the browser.

    Every product card is turned into a dict, printed, and collected.

    :return: list of product dicts with the keys ``title``, ``image``,
        ``price``, ``shoping``, ``shop`` and ``location``; empty when the page
        contains no product cards.
    """
    doc = pq(browser.page_source)
    card_selector = '.Card--doubleCardWrapper--L2XFE73'
    products = []
    for card in doc(card_selector).items():
        # The link is read from the card element itself: find() only searches
        # descendants, so looking up the card's own class inside the card
        # yields None. The descendant lookup is kept as a fallback.
        link = card.attr('href') or card.find(card_selector).attr('href')
        product = {
            'title': card.find('.Title--title--jCOPvpf').text(),
            'image': card.find('.MainPic--mainPic--rcLNaCv').attr('src'),
            'price': card.find('.Price--priceInt--ZlsSi_M').text(),
            'shoping': card.find('.Price--realSales--FhTZc7U').text(),
            'shop': card.find('.ShopInfo--shopName--rg6mGmy').text(),
            'location': link,
        }
        print(product)
        products.append(product)
    return products
def main():
    """Log in, crawl result pages 1 through MAX_PAGE, then close the browser.

    :return: None
    """
    try:
        login()
        for page_number in range(1, MAX_PAGE + 1):
            index_page(page_number)
            # Pause after each page before requesting the next one.
            time.sleep(20)
    finally:
        # Always shut the browser down, even when login or crawling fails,
        # so no Chrome/chromedriver process is left behind.
        browser.quit()
# Start the crawl only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()