[Python Crawler] Part 4: Scraping Weibo Data with Selenium


Scraping code:

# coding=utf-8
import os
import re
import time

from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

import IniFile

class weibo:

    def __init__(self):
        # Read the path of IEDriverServer.exe from the config file
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")

        # Delay after each scraped page, in seconds (defaults to 5)
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def scroll_top(self):
        '''
        Scroll to the top of the page
        :return:
        '''
        if self.driver.name == "chrome":
            js = "var q=document.body.scrollTop=0"
        else:
            js = "var q=document.documentElement.scrollTop=0"
        return self.driver.execute_script(js)

    def scroll_foot(self):
        '''
        Scroll to the bottom of the page
        :return:
        '''
        if self.driver.name == "chrome":
            js = "var q=document.body.scrollTop=10000"
        else:
            js = "var q=document.documentElement.scrollTop=10000"
        return self.driver.execute_script(js)

    def printTopic(self, topic):
        # The element text is expected to look roughly like
        #   <topic text>@<author> <time> ñ<likes> <comments> <reposts>
        # so we split on the last '@' and on the 'ñ' glyph Weibo places
        # before the counts.
        print 'Raw data: %s' % topic
        print ' '
        author_time_nums_index = topic.rfind('@')
        ht = topic[:author_time_nums_index]
        ht = ht.replace('\n', '')
        print 'Topic: %s' % ht

        author_time_nums = topic[author_time_nums_index:]
        author_time = author_time_nums.split('ñ')[0]
        nums = author_time_nums.split('ñ')[1]
        # Match timestamps such as 'N分钟前' (N minutes ago),
        # '今天 HH:MM' (today HH:MM) or 'M月D日 HH:MM' (month/day HH:MM)
        pattern1 = re.compile(r'\d{1,2}分钟前|今天\s{1}\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s{1}\d{2}:\d{2}')
        time1 = re.findall(pattern1, author_time)

        print 'Author: %s' % author_time.split(' ')[0]
        # print 'Time: %s' % author_time.split(' ')[1]
        print 'Time: %s' % time1[0]
        print 'Likes: %s' % nums.split(' ')[0]
        print 'Comments: %s' % nums.split(' ')[1]
        print 'Reposts: %s' % nums.split(' ')[2]
        print ' '

    def CatchData(self, listClass, firstUrl):
        '''
        Scrape the data
        :param listClass: list of XPath expressions for the target elements
        :param firstUrl: URL of the start page
        :return:
        '''
        start = time.clock()
        # Load the start page
        wait = ui.WebDriverWait(self.driver, 20)
        self.driver.get(firstUrl)
        # Print the page title
        print self.driver.title

        # # Bring a specific element into view
        # target = self.driver.find_element_by_id('J_ItemList')
        # self.driver.execute_script("arguments[0].scrollIntoView();", target)

        # Scroll to the bottom 5 times so more topics get lazy-loaded
        Scrollcount = 5
        while Scrollcount > 0:
            Scrollcount = Scrollcount - 1
            self.scroll_foot()  # scroll once, then locate elements once

        total = 0
        for className in listClass:
            time.sleep(10)
            wait.until(lambda driver: self.driver.find_elements_by_xpath(className))
            Elements = self.driver.find_elements_by_xpath(className)
            for element in Elements:
                print ' '
                txt = element.text.encode('utf8')
                self.printTopic(txt)
                total = total + 1

        self.driver.close()
        self.driver.quit()
        end = time.clock()
        print ' '
        print "Scraped %d topics in total" % total
        print "Total time: %f seconds" % (end - start)

# Test scraping Weibo data
obj = weibo()
# pt_li pt_li_2 S_bg2
# pt_li pt_li_1 S_bg2
# firstUrl = "http://weibo.com/?category=0"
firstUrl = "http://weibo.com/?category=1760"
listClass = []
listClass.append("//li[@class='pt_li pt_li_1 S_bg2']")
listClass.append("//li[@class='pt_li pt_li_2 S_bg2']")
obj.CatchData(listClass, firstUrl)
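The script imports a helper module IniFile that the post does not include. A minimal sketch of what it could look like, assuming a plain INI file parsed with Python 2's ConfigParser — only the ConfigFile class name and the GetValue(section, key) call are taken from the code above; everything else is an assumption:

# coding=utf-8
# IniFile.py -- hypothetical helper module assumed by the scraper above;
# only ConfigFile and GetValue(section, key) appear in the original code.
import ConfigParser

class ConfigFile:

    def __init__(self, path):
        self.cf = ConfigParser.ConfigParser()
        self.cf.read(path)

    def GetValue(self, section, key):
        # Return the configured value, or '' when the section/key is
        # missing, so callers can fall back to their own defaults
        try:
            return self.cf.get(section, key)
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            return ''

The matching config.conf would then contain something like (the driver path is a placeholder):

[section]
IEDriverServer = C:\IEDriverServer\IEDriverServer.exe
pageInteralDelay = 5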

Login window


    def longon(self):
        flag = True
        try:
            self.driver.get('https://weibo.com/')
            self.driver.maximize_window()
            time.sleep(2)
            # Fill in the account name and password, then submit the form
            accname = self.driver.find_element_by_id("loginname")
            accname.send_keys('username')
            accpwd = self.driver.find_element_by_name("password")
            accpwd.send_keys('password')
            submit = self.driver.find_element_by_xpath("//div[@class='info_list login_btn']/a")
            submit.click()
            time.sleep(2)
        except Exception as e1:
            message = str(e1.args)
            flag = False
        return flag
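With longon added as a method of the weibo class, a run would presumably log in first and only scrape on success. A hypothetical driver snippet (reusing listClass and firstUrl from the test code above; replace 'username' and 'password' in longon with real credentials):

# Hypothetical usage -- not part of the original post
obj = weibo()
if obj.longon():
    obj.CatchData(listClass, firstUrl)
else:
    print 'Login failed, skipping the scrape'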
