"""Scrape Taobao search results for 'xiaomi' with Selenium and save them to MongoDB."""
import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from config import *

# Maximum number of times search()/next_page() attempt a page before giving
# up. The retry used to be unbounded recursion, which ends in RecursionError
# when the page never loads.
_MAX_ATTEMPTS = 5

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.Chrome()

# Shared explicit wait: every wait.until(...) below gives up after 10 seconds.
wait = WebDriverWait(browser, 10)


def search():
    """Open Taobao, search for 'xiaomi' and scrape the first result page.

    Returns:
        The text of the pager's ``div.total`` element; main() extracts the
        page count from it.

    Raises:
        TimeoutException: if an expected element is still missing after
            ``_MAX_ATTEMPTS`` attempts.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            browser.get('https://www.taobao.com')
            input_ = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )

            input_.send_keys('xiaomi')
            submit.click()

            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over from the home page, but only a bounded number of times.
            if attempt == _MAX_ATTEMPTS:
                raise


def next_page(page_num):
    """Jump to result page ``page_num`` through the pager form and scrape it.

    Args:
        page_num: 1-based page number typed into the pager's input box.

    Raises:
        TimeoutException: if the pager never shows ``page_num`` as the active
            page (or another expected element is missing) after
            ``_MAX_ATTEMPTS`` attempts.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            input_ = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            input_.clear()
            input_.send_keys(str(page_num))
            submit.click()
            # The jump has taken effect once the highlighted pager item shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            if attempt == _MAX_ATTEMPTS:
                raise


def get_products():
    """Parse the current result page and store every listed product.

    Waits for at least one item to be present, then reads the page source
    with PyQuery, prints each product dict and passes it to save_to_mongo().

    Raises:
        TimeoutException: if no item appears within the wait timeout.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed suffix after the number -- confirm against
            # the live page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Best-effort: a failure is reported on stdout and does not stop the crawl.

    Args:
        result: product dict built by get_products(); pymongo adds an ``_id``
            key to it on insert.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document call.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        print('error to mongo', result, exc)
    else:
        print('success save to mongodb', result)


def main():
    """Run the crawl: search, then visit every remaining result page.

    Raises:
        ValueError: if the pager text returned by search() contains no number.
    """
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in pager text: %r' % total_text)
        total = int(match.group(1))
        # Page 1 was scraped by search(); the upper bound is inclusive so the
        # last page is not skipped.
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # Close the browser window even when the crawl aborts.
        browser.close()


if __name__ == '__main__':
    main()
config.py
# MongoDB settings read by the spider through `from config import *`.

# Host passed to pymongo.MongoClient().
MONGO_URL = 'localhost'
# Name of the database the spider writes to.
MONGO_DB = 'taobao'
# Name of the collection that receives one document per scraped product.
MONGO_TABLE = 'product'
运行结果:
数据库: