# Scrape People's Daily microblog (t.people.com.cn) with Selenium + PhantomJS.
import time
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# -----------------------------------------------------------------------------
# Crawl 10 pages of the People's Daily microblog feed: log in, scroll each
# page to force lazy loading, expand all comment sections, then append every
# post (author, body, comments) to renminweibo.txt.
# NOTE(review): PhantomJS is deprecated and removed in Selenium 4+ — consider
# a headless Chrome/Firefox driver. Credentials are hard-coded below.
# -----------------------------------------------------------------------------
driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\bin\phantomjs.exe')
wait = WebDriverWait(driver, 10)

try:
    # --- Log in --------------------------------------------------------------
    driver.get('http://t.people.com.cn/login.action')
    username = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#userName'))
    )
    username.send_keys('17332335684')
    password = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#password'))
    )
    password.send_keys('zgx675050748')
    time.sleep(4)                   # let the login widget finish initializing
    password.send_keys(Keys.ENTER)  # submit the login form

    # --- Crawl the feed, one page per iteration ------------------------------
    for page in range(10):
        # Scroll to the bottom three times so lazily-loaded posts render
        # (replaces the original copy-pasted sleep/scroll triplet).
        for _ in range(3):
            time.sleep(3)
            driver.execute_script('window.scrollTo(0,1000000)')
        time.sleep(3)

        # Expand every comment section so comments appear in page_source.
        # find_elements_by_xpath was removed in Selenium 4; By.XPATH is the
        # supported form and works in Selenium 3 as well.
        for btn in driver.find_elements(By.XPATH,
                                        "//a[@data-nodetype='btn_comment']"):
            try:
                btn.click()
            except WebDriverException:
                # Best effort: a stale/obscured button should not kill the run.
                pass

        # Parse the fully expanded page and append the results to the log.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # 'with' guarantees the file is closed even if parsing raises.
        with open('renminweibo.txt', 'a', encoding='UTF-8') as f:
            for block in soup.select('.list_detail'):
                name = block.select('.list_user .list_name')[0].text
                text = block.select('.list_text')[0].text
                print('发布者:')
                print(name)
                print('内容 :')
                print(text)
                f.write('发布者:' + '\n')
                f.write(name + '\n')
                f.write('内容:' + '\n')
                f.write(text + '\n')
                for comment_node in block.select('div.comment_text.skin_color_01'):
                    pinglun = comment_node.text
                    print('评论:')
                    print(pinglun)
                    f.write('评论:' + '\n')
                    f.write(pinglun + '\n')
                print('-----------------')
                f.write('-----------------' + '\n')

        # Advance to the next page ('next_btn' avoids shadowing builtin next).
        next_btn = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.wbp_pagelist_nextbtn'))
        )
        next_btn.click()
        time.sleep(30)              # give the next page time to load fully
finally:
    # Always release the browser process, even if the crawl fails midway.
    driver.quit()