# -*- coding: utf-8 -*-
# author : yesehngbao
# time: 2018/3/29
import logging
import re

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.utils import Keys
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'test'
MONGO_COLL = 'selenum_tao'

# Seconds every explicit wait blocks before Selenium raises TimeoutException.
WAIT_SECONDS = 10

logger = logging.getLogger(__name__)

# Single shared browser session used by every function below.
webdir = webdriver.Chrome()


def _wait_for(condition):
    """Block until *condition* holds on the shared driver and return its result."""
    return WebDriverWait(webdir, WAIT_SECONDS).until(condition)


def get_page_num(keyword='衬衫'):
    """Search Taobao for *keyword* and return the number shown in the pager.

    Opens the Taobao home page, types *keyword* into the search box, submits
    the search and reads the text of the pager's ``div.total`` element.

    Returns:
        The first run of digits found in that text, as a string.

    Raises:
        ValueError: if the pager text contains no digits.
    """
    webdir.get('http://www.taobao.com')
    search_box = _wait_for(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    search_button = _wait_for(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
    search_box.clear()
    search_box.send_keys(keyword)
    search_button.click()

    total_text = _wait_for(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))).text
    match = re.search(r'\d+', total_text)
    if match is None:
        raise ValueError(
            'no page count found in pager text: {!r}'.format(total_text))
    return match.group(0)


def gain_page(page, max_retries=3):
    """Jump the result list to *page* using the pager's input box.

    Waits for the result images to be present, types the page number into the
    pager input, clicks submit, then waits until the active pager item shows
    that number.

    Args:
        page: page number to jump to.
        max_retries: how many times to attempt the jump before giving up
            (values below 1 are treated as 1).

    Raises:
        WebDriverException: if the last attempt still fails (this includes
            Selenium's wait timeouts).
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            _wait_for(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.items .item .pic a img')))
            page_box = _wait_for(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.J_Input')))
            submit_button = _wait_for(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.J_Submit')))
            page_box.clear()
            # send_keys is given text explicitly rather than a raw int.
            page_box.send_keys(str(page))
            submit_button.click()
            _wait_for(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page)))
            return
        except WebDriverException:
            # A bounded loop replaces unbounded recursion, which could only
            # end in RecursionError when the page never loads.
            if attempt == attempts:
                raise
            logger.warning('jump to page %s failed (attempt %d/%d), retrying',
                           page, attempt, attempts)


def get_page_html(page):
    """Return the driver's current page source, or ``None`` if *page* is falsy."""
    if not page:
        return None
    return webdir.page_source


def analysis_page(html):
    """Yield one ``{'img': ..., 'money': ...}`` dict per parsed result item.

    Args:
        html: page markup to parse; falsy input yields nothing.

    Items for which either the image ``data-src`` attribute or the price text
    cannot be found are skipped instead of raising ``IndexError``.
    """
    if not html:
        return
    doc = etree.HTML(html)
    if doc is None:
        return
    items = doc.xpath('.//div[@class="items"]//div[contains(@class,"item")]')
    for item in items:
        images = item.xpath('.//div[@class="pic"]/a/img/@data-src')
        prices = item.xpath('.//div[contains(@class, "price")]/strong/text()')
        if not images or not prices:
            continue
        yield {
            'img': images[0],
            'money': prices[0],
        }


def save_mongo(content):
    """Insert *content* into the configured MongoDB collection.

    Args:
        content: a single document (dict) or an iterable of documents.

    Returns:
        The list of inserted ids; empty when there was nothing to insert.
    """
    documents = [content] if isinstance(content, dict) else list(content)
    if not documents:
        # insert_many rejects an empty batch, so skip the round trip.
        return []
    # The context manager closes the client's connections on exit.
    with pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT) as client:
        result = client[MONGO_DB][MONGO_COLL].insert_many(documents)
    return result.inserted_ids


def main():
    """Scrape every result page for the default keyword and store the items."""
    try:
        page_num = get_page_num()
        for page in range(1, int(page_num) + 1):
            gain_page(page)
            html = get_page_html(page)
            save_mongo(analysis_page(html))
    finally:
        # Always release the browser process, even when scraping fails.
        webdir.quit()


if __name__ == '__main__':
    main()