from selenium import webdriver
from bs4 import BeautifulSoup
import time
# Simulated Zhihu login via Selenium, then scrape the homepage feed titles.
# NOTE(review): account credentials are hard-coded below -- move them to
# environment variables or a config file before sharing this script.
browser = webdriver.Chrome()
browser.get('http://www.zhihu.com#signin')
# Switch from the QR-code panel to the account/password login form.
browser.find_element_by_class_name('qrcode-signin-cut-button').click()
browser.find_element_by_name('account').send_keys('18251552002')
browser.find_element_by_name('password').send_keys('000189')
time.sleep(5)  # pause so the captcha can be typed in by hand
browser.find_element_by_class_name('sign-button').click()
time.sleep(2)
# BUG FIX: the original URL was 'http:www.zhihu.com' (missing '//'),
# which is not a resolvable absolute URL.
browser.get('http://www.zhihu.com')
# Scroll to the bottom once to trigger AJAX loading of more feed items.
browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
time.sleep(2)
soup = BeautifulSoup(browser.page_source, 'lxml')
titles = soup.find_all('h2', class_='ContentItem-title')
news = []
for title in titles:
    info = {'title': title.find('a').get_text()}
    print(info)
    news.append(info)
print(len(news))
# LOL Seleium
def seleium_crawl(start_page, need_page):
    """Crawl guide listings from lol.qq.com with Selenium.

    Jumps to *start_page* via the page-number input box, then walks
    *need_page* consecutive pages, printing each article's title,
    read count and date.

    :param start_page: 1-based page number to start crawling from.
    :param need_page: how many consecutive pages to scrape.
    """
    browser = webdriver.Chrome()
    try:
        browser.get("http://lol.qq.com/guide/list.shtml")
        already_page = 0
        time.sleep(1)
        # Type the target page number into the jump box and navigate there.
        browser.find_element_by_css_selector('#list_page input').clear()
        time.sleep(1)
        browser.find_element_by_css_selector('#list_page input').send_keys('%d' % start_page)
        time.sleep(1)
        browser.find_element_by_css_selector('.pagejump').click()
        time.sleep(2)
        for _ in range(start_page, start_page + need_page):
            soup = BeautifulSoup(browser.page_source, 'lxml')
            for item in soup.find('ul', id='list_content').find_all('li'):
                new_info = {
                    'title': item.find('p', class_='btn-a').get_text(),
                    # drop the 4-character label prefix before the number
                    'read_num': item.find('p', class_='bfl-playing').get_text()[4:],
                    'time': item.find('span', class_='recommend-div-div-raiders-date fr').get_text(),
                }
                print(new_info)
            print('第%d页' % (start_page + already_page))
            already_page += 1
            # BUG FIX: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; Exception lets those propagate.
            try:
                browser.find_element_by_class_name('pagenext').click()
                time.sleep(1)
            except Exception:
                break  # no "next page" button -> reached the last page
    finally:
        # BUG FIX: close the browser even when scraping raises,
        # instead of leaking a Chrome process on every failure.
        browser.close()