【spider】Python爬虫爬取Yohobuy网站数据

本文主要使用 Selenium 爬取有货网(Yohobuy)鞋靴类商品数据,并使用 MongoDB 存储爬取结果;数据处理部分请参照“yohobuy数据处理”一文。

#spider.py
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *
import pymongo

# MongoDB client and the database named by MONGO_DB (both values come from config).
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# One Chrome session shared by every function in this module; the window is
# resized right after the browser starts, before any page is requested.
browser = webdriver.Chrome()
browser.set_window_size(1400, 900)

# Explicit-wait helper bound to that session with a 10-second timeout.
wait = WebDriverWait(browser, 10)


def search(max_attempts=3):
    """Navigate from the Yohobuy home page to the shoe listing and scrape page one.

    The steps are: open the home page, click the element that closes the
    cover overlay, click the fourth entry of the sub-navigation, click the
    icon that removes the pre-selected filter tag, wait for the pager element,
    and finally call ``get_products()`` for the page that is now displayed.

    The whole sequence is retried when any wait times out, but only up to
    ``max_attempts`` times, so a permanently missing element can no longer
    cause unbounded recursion.

    Args:
        max_attempts: Maximum number of times the navigation is attempted.
            Must be at least 1.

    Returns:
        The text of the pager element (the caller parses the page count
        out of it).

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        TimeoutException: If the last attempt also timed out.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    cover_close_xpath = '//*[@id="cover"]/div[2]/a'
    shoes_selector = '#yoho-header > div.nav-wrapper.clearfix > div > ul.sub-nav-list.boys.cure > li:nth-child(4) > a'
    filter_close_selector = 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.filter-box > div.checked-conditions.section > div > a.tag > i'
    pager_selector = 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.sort-pager > div > p > span:nth-child(2)'

    for attempt in range(1, max_attempts + 1):
        print('正在搜索')
        try:
            browser.get('https://www.yohobuy.com')
            close = wait.until(
                EC.element_to_be_clickable((By.XPATH, cover_close_xpath))
            )
            close.click()
            shoes = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, shoes_selector))
            )
            shoes.click()
            close1 = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, filter_close_selector))
            )
            close1.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, pager_selector))
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over from the home page, unless this was the final attempt,
            # in which case the timeout is reported to the caller.
            if attempt == max_attempts:
                raise


def next_page(total_page):
    """Walk through listing pages 2..total_page, scraping each one after loading it.

    Page 1 is already scraped by ``search()``. For every further page this
    function first clicks the "next page" image and only then calls
    ``get_products()``. Scraping before the click would store the current
    page a second time and would never reach the last page.

    Args:
        total_page: Total number of listing pages; nothing happens when it
            is smaller than 2.

    Raises:
        TimeoutException: If the next-page control or the product cards do
            not appear within the wait timeout.
    """
    next_selector = 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.goods-container.clearfix > div.block-next-page > a > img'
    for i in range(2, total_page + 1):
        print('正在翻页', i)
        fan = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, next_selector))
        )
        fan.click()
        # NOTE(review): assumes the click has navigated to page i by the time
        # get_products() reads the page source; confirm against the live site.
        get_products()


def get_products():
    """Parse every product card on the current page, print it and store it.

    Waits until at least one product card is present, parses the browser's
    page source with pyquery, and for each card builds a dict with the keys
    ``url``, ``brand``, ``deal`` and ``title`` which is printed and passed
    to ``save_to_mongo()``.
    """
    card_selector = '.product-page .goods-container .good-info'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))

    page = pq(browser.page_source)
    for card in page(card_selector).items():
        link = card.find('.good-thumb').attr('href')
        brand_name = card.find('.brand ').text()
        # Keep what precedes the first '.', drop its first character, and
        # convert the remainder to an integer.
        before_dot = card.find('.price').text().split('.')[0]
        price = int(before_dot[1:])
        # Only the first line of the detail text is used as the title.
        first_line = card.find('.good-detail-text').text().split('\n')[0]

        product = {
            'url': link,
            'brand': brand_name,
            'deal': price,
            'title': first_line,
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the MONGO_TABLE collection.

    Uses ``Collection.insert_one`` because the legacy ``Collection.insert``
    method no longer exists in PyMongo 4; calling it there raises, and the
    broad handler below would hide that, so no document would be stored.

    Saving stays best-effort: a failure is reported on stdout, together with
    the exception, and is not propagated to the caller.

    Args:
        result: The product dict to store.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print("保存到MONGODB失败", result, exc)
    else:
        print("存储到MONGODB成功", result)


def main():
    """Run the whole crawl: first page via search(), the rest via next_page().

    The browser session is shut down in a ``finally`` clause so that it is
    released even when the crawl fails part-way. ``quit()`` is used rather
    than ``close()`` so that the driver session ends, not just the window.
    """
    try:
        total = search()
        # The page count is taken from the pager text after skipping its
        # first two characters.
        # NOTE(review): the exact pager text format is not visible here;
        # confirm against the live page.
        total = int(total[2:])
        next_page(total)
    finally:
        browser.quit()


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

#config.py
# Host passed to pymongo.MongoClient by spider.py.
MONGO_URL = 'localhost'
# Name of the database spider.py opens on that client.
MONGO_DB = 'yohobuy'
# Name of the collection that scraped products are inserted into.
MONGO_TABLE = 'product'

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值