抓取淘宝美食数据

1. 得到所有页的数量

2. 解析页面得到产品信息

3. 实现翻页,得到所有关于美食的数据

import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

 

# Do not load images.
# NOTE(review): these two switches look like PhantomJS command-line options;
# confirm that chromedriver actually accepts them when passed as service args.

SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']

# Optional headless configuration, currently disabled:
# chrome_options=Options()

# chrome_options.add_argument("--headless")

 

# Module-level browser session shared by every function below.
driver=webdriver.Chrome(service_args=SERVICE_ARGS)

# driver=webdriver.Chrome(service_args=SERVICE_ARGS,chrome_options=chrome_options)

# Shared explicit wait bound to the driver, constructed with a timeout value of 10.
wait=WebDriverWait(driver,10)

# Open the Taobao home page as soon as the module is loaded.
driver.get("https://www.taobao.com/")

driver.set_window_size(width=1500,height=800)

 

 

# 第一步,得到美食的所有页

def get_page_num():
    """Search Taobao for "美食" and return the page count shown by the pager.

    Types the keyword into the search box, submits the form, reads the text
    of the element with class ``total``, and extracts the first run of digits
    from it.  The products of the first result page are printed as a side
    effect via ``get_product_into()``.

    Returns:
        str: The first run of digits found in the pager summary text.  It is
        returned as a string; the caller converts it with ``int()``.

    Raises:
        ValueError: If the pager summary text contains no digits.
    """
    print("搜索>>>美食")

    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    driver.find_element(By.CSS_SELECTOR, "#J_TSearchForm > div > button").click()

    # The pager summary may not be in the DOM yet when click() returns, so
    # wait for it explicitly instead of looking it up immediately.
    total_element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "total"))
    )
    text = total_element.text
    print(text)

    match = re.search(r"(\d+)", text)
    if match is None:
        raise ValueError("no page count found in pager text: %r" % text)
    page_num1 = match.group(0)

    # Page 1 is already displayed after the search, so print it right away.
    get_product_into()

    return page_num1

 

def get_product_into():
    """Parse the currently displayed result page and print every product.

    Waits until at least one result item is present, parses
    ``driver.page_source`` with BeautifulSoup, and prints one dict per item
    (preceded by a separator line of 100 asterisks).
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )

    soup = BeautifulSoup(driver.page_source, "lxml")

    for item in soup.select("#mainsrp-itemlist .items .item"):
        print("*" * 100)
        print(_extract_item(item))


def _first_text(tag, selector):
    """Return the stripped text of the first element under *tag* matching *selector*."""
    return tag.select(selector)[0].text.strip()


def _extract_item(item):
    """Build the product record (image, price, location, title, link, shop) for one item tag."""
    location = _first_text(item, ".location")
    price = _first_text(item, ".price")
    shopname = _first_text(item, ".shopname")
    title = _first_text(item, 'a[class="J_ClickStat"]')
    product_link = item.select(".J_ClickStat")[0].attrs["href"]

    # The image URL lives in "data-src" or, for lazily loaded images, in
    # "data-ks-lazyload".  Use .get() so that a missing "data-src" falls
    # through to the lazy-load attribute instead of raising KeyError.
    image_attrs = item.select(".J_ItemPic.img")[0].attrs
    image = image_attrs.get("data-src") or image_attrs.get("data-ks-lazyload")

    # Key order matches the order in which the record is printed.
    return {
        # Empty string when neither image attribute is available.
        "image": "https:" + image if image else "",
        "price": price,
        "location": location,
        "title": title,
        "product_link": "https:" + product_link,
        "shopname": shopname,
    }

 

 

 

def next_page(page, max_retries=3):
    """Jump to result page *page* through the pager form and print its products.

    Enters the page number into the pager's input box, clicks the submit
    button, and waits until the active pager item shows the requested number.
    A failed attempt is retried up to ``max_retries`` times in total.  The
    products are parsed exactly once, and only after a successful jump.

    Args:
        page: Page number to load.
        max_retries: Maximum number of attempts before the page is skipped.
            Values below 1 skip the page without trying.
    """
    print("当前正在加载第%s页的数据------------------------------------"%page)

    page_input_css = "#mainsrp-pager > div > div > div > div.form > input"
    submit_css = "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"
    active_css = "#mainsrp-pager > div > div > div > ul > li.item.active"

    for _attempt in range(max_retries):
        try:
            page_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, page_input_css))
            )
            page_box.clear()
            page_box.send_keys(str(page))
            driver.find_element(By.CSS_SELECTOR, submit_css).click()
            # The jump is done once the highlighted pager item shows the number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, active_css), str(page)
                )
            )
        except WebDriverException as e:
            # Only browser-side failures (timeouts, stale or missing elements)
            # are retried; programming errors propagate to the caller.
            print(e)
        else:
            break
    else:
        # Every attempt failed: skip this page rather than retrying forever.
        print("第%s页加载失败,已跳过" % page)
        return

    get_product_into()

 

def main():
    """Run the whole crawl: search, print page 1, then visit the remaining pages.

    Any exception raised along the way is printed rather than propagated,
    and the browser is always closed at the end.
    """
    try:
        total = get_page_num()
        print("总页数是:", total)

        # Page 1 was already handled by get_page_num(), so continue from page 2.
        remaining_pages = range(2, int(total) + 1)
        for page_no in remaining_pages:
            next_page(page_no)
    except Exception as err:
        print(err)
    finally:
        driver.quit()

 

# Start the crawl only when this file is run directly as a script.
if __name__ == '__main__':

    main()

结果:


阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页