爬一个人的所有微博 python_python爬取微博获得微博个人页面所有相关信息

最新推荐文章于 2021-09-05 23:46:36 发布

weixin_39737947

最新推荐文章于 2021-09-05 23:46:36 发布

阅读量73

点赞数

文章标签：爬一个人的所有微博 python

[Python] 纯文本查看复制代码
#!/usr/bin/env python

# -*- coding: utf-8 -*-

from urllib import quote,unquote

from selenium import webdriver

from selenium.webdriver.firefox.options import Options

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

import json

from time import sleep

import re

import codecs

class Weibo(object):
    """Selenium-driven scraper for a Weibo user search page and a user's posts.

    Constructing an instance opens a Firefox window. ``run`` loads a
    user-search page, prints the accounts found, asks the operator to pick
    one on stdin, opens that account's main page in a second window and
    writes the posts found there to ``./<nickName>.json`` (one JSON object
    per line).
    """

    def __init__(self, url=""):
        """Store *url* and start a Firefox WebDriver session.

        Args:
            url: Kept on the instance as ``self.url``; no method of this
                class reads it.
        """
        self.url = url
        # Headless start-up, left disabled by the original author:
        #firefoxoption = webdriver.FirefoxOptions()
        #firefoxoption.set_headless()
        #self.browser = webdriver.Firefox(firefox_options=firefoxoption)
        print("正在打开浏览器,请稍后...")
        self.browser = webdriver.Firefox()

    def GetPageSource(self, url, mate, callback):
        """Load *url*, wait for an element to exist, then parse the page.

        Args:
            url: Address to open in the current browser window.
            mate: Locator tuple (for example ``(By.ID, "some_id")``) handed
                to ``EC.presence_of_element_located``.
            callback: Callable that receives the page source string.

        Returns:
            Whatever *callback* returns.

        Raises:
            The timeout error raised by ``WebDriverWait.until`` when the
            element does not appear within 10 seconds.
        """
        print("正在打开网页,请稍后...")
        self.browser.get(url)
        # Only the waiting matters here; the located element itself is unused.
        WebDriverWait(self.browser, 10).until(EC.presence_of_element_located(mate))
        return callback(self.browser.page_source)

    def GetUserList(self, url):
        """Extract the accounts listed on a user-search result page.

        Args:
            url: Despite its name, this is the page's HTML source (it is fed
                straight to BeautifulSoup), not an address.

        Returns:
            A list of dicts with the keys ``nickName``, ``mainPage``,
            ``Address``, ``Card``, ``Num`` and ``PersonInfo``. ``Card`` is
            ``""`` when the entry has no ``.person_card`` element.

        Raises:
            IndexError / TypeError / KeyError if an entry lacks one of the
            other expected elements or attributes.
        """
        print ("正在获得找人页面所有匹配到的信息,请稍后...")
        soup = BeautifulSoup(url, "lxml")
        retUserList = []
        for user in soup.select("#pl_user_feedList .list_person"):
            name_link = user.select(".person_name")[0].a
            card = user.select(".person_card")
            retUserList.append({
                "nickName": name_link['title'],
                # The href is used as scheme-relative, so the scheme is prepended.
                "mainPage": "https:" + name_link['href'],
                "Address": user.select(".person_addr > span:nth-of-type(2)")[0].get_text(),
                "Card": card[0].get_text(strip=True) if card else "",
                # Collapse the multi-line counters block onto a single line.
                "Num": " ".join(user.select(".person_num")[0].get_text().lstrip().split("\n")),
                "PersonInfo": re.sub("[\t\n]", "", user.select(".person_info")[0].get_text()),
            })
        return retUserList

    def GetPersonPageContent(self, url):
        """Extract the posts shown on a user's main page.

        Args:
            url: Despite its name, this is the page's HTML source.

        Returns:
            A list of dicts with the keys ``id`` (1-based position of the
            post on the page, as a string), ``from``, ``text`` and
            ``videoOrImg`` (``""`` when the post has no media wrapper).
            Posts whose expected elements are missing are skipped, so the
            ``id`` values may have gaps.
        """
        print("正在或者个人页面信息,请稍后")
        soup = BeautifulSoup(url, "lxml")
        contentList = soup.select("div[node-type='feed_list'] > div[action-type='feed_list_item']")
        retPersonInfoList = []
        for position, item in enumerate(contentList, 1):
            try:
                media_wrap = item.select(".WB_media_wrap")
                contentInfo = {
                    "id": str(position),
                    "from": item.select(".WB_from")[0].get_text(strip=True),
                    "text": item.select(".WB_text.W_f14")[0].get_text(strip=True),
                    "videoOrImg": self.GetImageOrVideoPath(media_wrap[0]) if media_wrap else "",
                }
            except (IndexError, KeyError, AttributeError, TypeError):
                # Best effort: a post with an unexpected layout is skipped.
                # Only lookup failures are swallowed, so Ctrl-C and genuine
                # programming errors still surface.
                continue
            retPersonInfoList.append(contentInfo)
        return retPersonInfoList

    def GetImageOrVideoPath(self, source):
        """Return the media address(es) attached to one post.

        Args:
            source: The post's ``.WB_media_wrap`` element.

        Returns:
            * the decoded video address when a ``.WB_video`` element exists;
            * otherwise the image address taken from the ``clear_picSrc``
              field of the ``action-data`` attribute: a comma-joined list
              when the data mentions ``pic_ids``, a single address if not;
            * the ``src`` of the embedded ``<img>`` when there is no
              ``action-data`` attribute at all;
            * ``""`` when ``action-data`` is empty or has no
              ``clear_picSrc`` field.

        Raises:
            IndexError / KeyError / TypeError / AttributeError when the
            expected elements or attributes are absent.
        """
        media = source.select(".WB_media_a")[0]
        video = media.select(".WB_video")
        if video:
            # The value is percent-encoded twice. [8:] drops a fixed
            # 8-character prefix (presumably "fluency=" -- TODO confirm).
            return unquote(unquote(video[0]["video-sources"][8:]))

        try:
            action_data = media["action-data"]
        except KeyError:
            return media.select(".WB_pic")[0].a.img["src"]
        if not action_data:
            return ""

        # Capture up to the next "&" or the end of the string, so the field
        # is found whether or not other parameters follow it.
        found = re.search(r"clear_picSrc=([^&]*)", action_data)
        if found is None:
            return ""
        if "pic_ids" in action_data:
            return ",".join("https:%s" % unquote(img) for img in found.group(1).split(","))
        return "https:" + unquote(found.group(1))

    def SavePersonInfo(self, filename, content):
        """Write *content* to ``./<filename>.json``, one JSON document per line.

        Args:
            filename: File name without the ``.json`` extension.
            content: Iterable of JSON-serialisable objects.
        """
        with codecs.open("./%s.json" % filename, "w+", "utf-8") as out:
            for record in content:
                out.write(json.dumps(record) + "\n")

    def run(self, url):
        """Search, let the operator choose an account, and save its posts.

        Args:
            url: Address of the user-search result page.

        Returns:
            ``-1`` when the search page lists no accounts, otherwise ``None``.
        """
        userList = self.GetPageSource(url, (By.ID, "pl_user_feedList"), self.GetUserList)
        if not userList:
            return -1

        for number, user in enumerate(userList, 1):
            print ("%d:\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n" % (
                number, user["nickName"], user["mainPage"], user["Address"],
                user["Card"], user["Num"], user["PersonInfo"]))

        total = len(userList)
        while True:
            try:
                inputcontent = int(raw_input("请在上面输出的内容中选择需要的选项 1-%d: " % total))
            except ValueError:
                # Not a number: fall through to the range reminder below.
                # EOFError and KeyboardInterrupt are deliberately not caught,
                # otherwise a closed stdin would loop forever.
                inputcontent = 0
            if 0 < inputcontent <= total:
                break
            print("请输入数字的范围 1 - %d " % total)

        chosen = userList[inputcontent - 1]

        # Open the profile in a fresh window and switch to it. The new window
        # is found by comparing handles rather than assuming its position.
        known_handles = set(self.browser.window_handles)
        self.browser.execute_script("window.open()")
        opened = [h for h in self.browser.window_handles if h not in known_handles]
        if opened:
            self.browser.switch_to.window(opened[0])

        userInfo = self.GetPageSource(
            chosen["mainPage"],
            (By.CSS_SELECTOR, "div[node-type='feed_list']"),
            self.GetPersonPageContent)
        if userInfo:
            self.SavePersonInfo(chosen["nickName"], userInfo)

    def __del__(self):
        """Shut the browser down when the scraper is garbage-collected."""
        # __init__ may have failed before self.browser was assigned.
        browser = getattr(self, "browser", None)
        if browser is None:
            return
        try:
            # quit() closes every window and ends the driver session.
            browser.quit()
        except Exception:
            # Nothing useful can be done about a failure inside a finalizer.
            pass

def main():
    """Ask for a name on stdin and run the Weibo user search for it."""
    keyword = raw_input("请输入需要搜索的名字 : ")
    # The name is percent-encoded twice before it is placed in the search URL.
    search_url = "http://s.weibo.com/user/%s&Refer=index" % quote(quote(keyword))
    Weibo().run(search_url)


if __name__ == '__main__':
    main()

weixin_39737947

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫