python抓取微博数据_[Python爬虫] 之四：Selenium 抓取微博数据

最新推荐文章于 2021-09-30 09:22:49 发布

weixin_39689347

最新推荐文章于 2021-09-30 09:22:49 发布

阅读量212

点赞数

文章标签： python抓取微博数据

抓取代码：

# coding=utf-8

import os

import re

from selenium import webdriver

import selenium.webdriver.support.ui as ui

from selenium.webdriver.common.keys import Keys

import time

from selenium.webdriver.common.action_chains import ActionChains

import IniFile

class weibo:
    """Selenium-driven scraper that prints the topic cards of a Weibo page.

    The browser is an Internet Explorer session; the path of its driver
    executable is read from ``config.conf`` in the current working directory.
    The code is written so that it runs on both Python 2 and Python 3.
    """

    # Timestamp formats recognised in a card's author line: "N分钟前",
    # "今天 HH:MM" or "M月D日 HH:MM".  Compiled once instead of once per topic.
    _TIME_PATTERN = re.compile(r'\d{1,2}分钟前|今天\s{1}\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s{1}\d{2}:\d{2}')

    def __init__(self):
        """Read ``config.conf`` and start the Internet Explorer driver."""
        # The IEDriverServer.exe path comes from the configuration file.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")

        # Delay after each scraped page, in seconds.  Defaults to 5 unless
        # "pageInteralDelay" is set in the configuration file.
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    @staticmethod
    def _to_native_str(value):
        """Return ``value`` as the interpreter's native ``str`` type.

        The string literals in this class are byte strings on Python 2 and
        text on Python 3, so scraped text has to be converted to match:
        UTF-8 encoded on Python 2, decoded on Python 3.
        """
        if isinstance(value, str):
            return value
        if isinstance(value, bytes):
            # Only reachable on Python 3 (on Python 2 ``bytes is str``).
            return value.decode('utf8')
        # Python 2 ``unicode`` object.
        return value.encode('utf8')

    def _scroll_to(self, offset):
        """Set the page's vertical scroll offset to ``offset`` pixels."""
        # Chrome is addressed through document.body; every other driver
        # through document.documentElement.
        if self.driver.name == "chrome":
            target = "document.body"
        else:
            target = "document.documentElement"
        return self.driver.execute_script("var q=%s.scrollTop=%d" % (target, offset))

    def scroll_top(self):
        """Scroll to the top of the page.

        :return: whatever ``execute_script`` returns for the scroll statement
        """
        return self._scroll_to(0)

    def scroll_foot(self):
        """Scroll down to a vertical offset of 10000 pixels.

        :return: whatever ``execute_script`` returns for the scroll statement
        """
        return self._scroll_to(10000)

    @classmethod
    def _parse_topic(cls, topic):
        """Split the text of one topic card into its fields.

        Everything before the last '@' is the topic body; the remainder is
        "<author> <time> ñ<likes> <comments> <forwards>".  A field that is
        absent from the text is returned as an empty string instead of
        raising ``IndexError``.

        :param topic: card text as a native ``str``
        :return: dict with the keys ``topic``, ``author``, ``time``,
                 ``likes``, ``comments`` and ``forwards``
        """
        split_at = topic.rfind('@')
        if split_at < 0:
            # No author marker: treat the whole text as the topic body
            # (slicing with -1 would silently cut off the last character).
            split_at = len(topic)
        body = topic[:split_at].replace('\n', '')

        parts = topic[split_at:].split('ñ')
        author_time = parts[0]
        nums = parts[1] if len(parts) > 1 else ''

        times = cls._TIME_PATTERN.findall(author_time)
        # Pad so that missing counters come out as '' rather than raising.
        counts = (nums.split(' ') + ['', '', ''])[:3]
        return {
            'topic': body,
            'author': author_time.split(' ')[0],
            'time': times[0] if times else '',
            'likes': counts[0],
            'comments': counts[1],
            'forwards': counts[2],
        }

    def printTopic(self, topic):
        """Print the raw text of a topic card followed by its parsed fields.

        :param topic: card text; UTF-8 bytes or text are both accepted
        """
        topic = self._to_native_str(topic)
        print('原始数据： %s' % topic)
        print(' ')
        fields = self._parse_topic(topic)
        print('话题： %s' % fields['topic'])
        print('话题作者： %s' % fields['author'])
        print('时间： %s' % fields['time'])
        print('点赞量： %s' % fields['likes'])
        print('评论量： %s' % fields['comments'])
        print('转发量： %s' % fields['forwards'])
        print(' ')

    def CatchData(self, listClass, firstUrl):
        """Load ``firstUrl``, scroll it five times and print each topic once.

        :param listClass: XPath expressions selecting the topic elements
        :param firstUrl: URL of the page to scrape
        :return: None; results are printed and the browser is shut down
        """
        # time.time() rather than time.clock(): clock() was removed in
        # Python 3.8 and measured CPU time on Unix, which excludes sleeps.
        start = time.time()
        wait = ui.WebDriverWait(self.driver, 20)
        # Texts already printed.  Every pass re-reads all matching elements,
        # so without this the same card would be reported on each pass.
        seen = set()
        try:
            # Load the first page and print its title.
            self.driver.get(firstUrl)
            print(self.driver.title)

            # Scroll five times, looking for topics after every scroll.
            # NOTE(review): the nesting of the lookup inside the scroll loop
            # is reconstructed from the original inline comment - confirm.
            for _ in range(5):
                self.scroll_foot()
                for className in listClass:
                    # NOTE(review): self.pageDelay is configured in __init__
                    # but this fixed 10 s pause is used instead - confirm.
                    time.sleep(10)
                    # until() returns the truthy result of the callable, so
                    # the element list does not need to be fetched twice.
                    Elements = wait.until(
                        lambda driver, xpath=className: driver.find_elements_by_xpath(xpath))
                    for element in Elements:
                        txt = self._to_native_str(element.text)
                        if txt in seen:
                            continue
                        seen.add(txt)
                        print(' ')
                        self.printTopic(txt)
        finally:
            # Always end the browser session, even when scraping fails;
            # quit() also closes the window.
            self.driver.quit()

        end = time.time()
        print(' ')
        print("共抓取了: %d 个话题" % len(seen))
        print("整个过程用时间: %f 秒" % (end - start))

# Manual test run: scrape the topic cards of one Weibo category page.

# Page to scrape; "http://weibo.com/?category=0" is the commented-out
# alternative from the original script.
firstUrl = "http://weibo.com/?category=1760"

# One XPath per <li> class used by the topic cards
# ("pt_li pt_li_1 S_bg2" and "pt_li pt_li_2 S_bg2").
listClass = [
    "//li[@class='pt_li pt_li_1 S_bg2']",
    "//li[@class='pt_li pt_li_2 S_bg2']",
]

obj = weibo()
obj.CatchData(listClass=listClass, firstUrl=firstUrl)

登录窗口

def longon(self):
    """Log in to weibo.com by filling in and submitting the login form.

    Opens the Weibo home page in ``self.driver``, types the account name and
    password into the login fields and clicks the login button.  The
    credentials are the placeholder literals 'username' and 'password'.

    :return: True when every step ran without raising, False otherwise
    """
    flag = True
    try:
        self.driver.get('https://weibo.com/')
        self.driver.maximize_window()
        # Pause before looking up the login form elements.
        time.sleep(2)

        accname = self.driver.find_element_by_id("loginname")
        accname.send_keys('username')
        accpwd = self.driver.find_element_by_name("password")
        accpwd.send_keys('password')

        submit = self.driver.find_element_by_xpath("//div[@class='info_list login_btn']/a")
        submit.click()
        # Pause after submitting the form.
        time.sleep(2)
    except Exception as e1:
        # Best-effort login: report the reason instead of discarding it and
        # signal the failure through the return value.
        message = str(e1.args)
        print('登录失败： %s' % message)
        flag = False
    return flag

weixin_39689347

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫