主要使用 Selenium 爬取有货网鞋靴类商品数据,运用 MongoDB 对数据进行存储;数据处理部分参照 yohobuy 数据处理。
#spider.py
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *
import pymongo
# MongoDB client and target database; MONGO_URL and MONGO_DB are provided by
# the star-import from config.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# A single Chrome session is created at import time and shared by the
# scraping functions below.
browser = webdriver.Chrome()
# Explicit waits on the shared browser use a 10-second timeout.
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)
def search(max_retries=3):
    """Navigate from the home page to the product listing and scrape page one.

    Loads https://www.yohobuy.com, clicks through three elements (variable
    names suggest: a cover/overlay close link, the shoes navigation entry and
    a pre-applied filter tag -- TODO confirm against the live page), waits for
    the pager element, then scrapes the current listing with get_products().

    Args:
        max_retries: Maximum number of full attempts when a wait times out.
            The original implementation retried by unbounded recursion, which
            ends in RecursionError if the site or a selector is permanently
            broken; the retry count is now bounded.

    Returns:
        The text of the pager element (main() parses the page count from it).

    Raises:
        TimeoutException: If every attempt times out.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print('正在搜索')
        try:
            browser.get('https://www.yohobuy.com')
            close = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="cover"]/div[2]/a'))
            )
            close.click()
            shoes = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#yoho-header > div.nav-wrapper.clearfix > div > ul.sub-nav-list.boys.cure > li:nth-child(4) > a'))
            )
            shoes.click()
            close1 = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.filter-box > div.checked-conditions.section > div > a.tag > i'))
            )
            close1.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.sort-pager > div > p > span:nth-child(2)'))
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over from the home page, but give up after the last
            # attempt so the caller sees the real failure.
            if attempt == attempts:
                raise
def next_page(total_page):
    """Walk through listing pages 2..total_page and scrape each one.

    Page one is expected to have been scraped already (search() does so).
    The original loop scraped *before* clicking the next-page button, which
    stored page one twice and never scraped the final page; the order is now
    click -> wait for the new page -> scrape.

    Args:
        total_page: Total number of listing pages (int).

    Raises:
        TimeoutException: If the next-page button or the product cards do not
            appear within the shared wait timeout.
    """
    next_button_selector = 'body > div.product-page.yoho-page.product-list-page.static-list-page > div > div.list-right.pull-right > div.goods-container.clearfix > div.block-next-page > a > img'
    for i in range(2, total_page + 1):
        print('正在翻页', i)
        fan = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, next_button_selector))
        )
        fan.click()
        try:
            # Once the old button is detached from the DOM the previous page
            # is gone, so get_products() cannot re-read stale content.
            wait.until(EC.staleness_of(fan))
        except TimeoutException:
            # The button was never replaced (e.g. the listing updated in
            # place); fall through and scrape whatever is displayed now.
            pass
        get_products()
def get_products():
    """Parse every product card on the current page and store it in MongoDB.

    Waits until at least one product card is present, parses the page source
    with PyQuery and, for each card, builds a dict with the keys ``url``,
    ``brand``, ``deal`` (integer part of the price) and ``title`` (first line
    of the detail text), prints it and passes it to save_to_mongo().

    A card whose price text cannot be converted to an integer is reported and
    skipped; previously the resulting ValueError aborted the whole crawl.

    Raises:
        TimeoutException: If no product card appears within the wait timeout.
    """
    card_selector = '.product-page .goods-container .good-info'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))
    doc = pq(browser.page_source)
    for item in doc(card_selector).items():
        price_text = item.find('.price').text()
        try:
            # Drop the leading character (presumably the currency symbol --
            # TODO confirm), keep the part before the decimal point, and
            # remove thousands separators before converting.
            deal = int(price_text.split('.')[0][1:].replace(',', ''))
        except ValueError:
            print('价格解析失败, 跳过该商品', repr(price_text))
            continue
        product = {
            'url': item.find('.good-thumb').attr('href'),
            'brand': item.find('.brand ').text(),
            'deal': deal,
            'title': item.find('.good-detail-text').text().split('\n')[0],
        }
        print(product)
        save_to_mongo(product)
def save_to_mongo(result):
    """Insert one product document into the configured MongoDB collection.

    Uses ``Collection.insert_one``; the legacy ``Collection.insert`` was
    deprecated in PyMongo 3 and removed in PyMongo 4, where the broad
    ``except`` below would have reported every record as failed.

    Storage is best-effort: any error is printed together with the record
    and the crawl continues.

    Args:
        result: Product dict to store.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Keep the crawl going, but show why the write failed.
        print("保存到MONGODB失败", result, exc)
    else:
        print("存储到MONGODB成功", result)
def main():
    """Run the full crawl: scrape page one, then every remaining page.

    The browser is shut down in a ``finally`` clause so that a failure during
    the crawl does not leave Chrome and its driver process running.
    ``quit()`` is used instead of ``close()`` because ``close()`` only closes
    the current window and leaves the driver session alive.
    """
    try:
        total = search()
        # Skip the first two characters of the pager text and parse the rest
        # as the page count (presumably a "1/N"-style label -- TODO confirm).
        total = int(total[2:])
        next_page(total)
    finally:
        browser.quit()
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
#config.py
# Host passed to pymongo.MongoClient by spider.py.
MONGO_URL = 'localhost'
# Name of the database spider.py selects on the client.
MONGO_DB = 'yohobuy'
# Name of the collection that scraped products are inserted into.
MONGO_TABLE = 'product'