Notes from learning to use a mobile Weibo crawler

Environment: Python 3.6. Purpose: scrape post text and a single image per post.

 

Based on: https://blog.csdn.net/Asher117/article/details/82793091?utm_source=blogxgwz1

 

import re
import time
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver

try:
    print('Logging in to mobile Sina Weibo...')
    # Open a Chrome browser
    browser = webdriver.Chrome()
    # The login page URL
    url = 'https://passport.weibo.cn/signin/login'
    browser.get(url)
    time.sleep(3)
    # Locate the username field, clear it, then type in your account
    username = browser.find_element_by_css_selector('#loginName')
    time.sleep(2)
    username.clear()
    username.send_keys('XXXaccountXXX@XXX.com')  # your own account
    # Locate the password field and type in your password
    password = browser.find_element_by_css_selector('#loginPassword')
    time.sleep(2)
    password.send_keys('XXXpasswordXXXX')
    # Click the login button
    browser.find_element_by_css_selector('#loginAction').click()
    # The 15-second pause here matters: after clicking login, Weibo shows a
    # nine-grid pattern CAPTCHA. Handling it programmatically is fiddly (Cui
    # Qingcai's Python book covers one approach), so it is solved by hand here.
    time.sleep(15)
    print('Login complete!')
except Exception:
    print('######## Error ########')
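Rather than a fixed 15-second sleep, you can block until the browser actually leaves the login page once the CAPTCHA has been solved by hand. A minimal sketch using Selenium's explicit waits (the 60-second timeout is an assumption; raise it if the CAPTCHA takes longer):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait (up to an assumed 60 s) until the URL changes away from the login
# page, i.e. the CAPTCHA was solved and the post-login redirect fired.
WebDriverWait(browser, 60).until(EC.url_changes(url))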



# This post uses GUCCI as the example
id = 'u/1934738161'
niCheng = id
# A user's profile URL has the form:
url = 'http://weibo.cn/' + id
browser.get(url)
time.sleep(3)
# Parse the page HTML with BeautifulSoup
soup = BeautifulSoup(browser.page_source, 'lxml')

# Extract the user's uid
uid = soup.find('td', attrs={'valign': 'top'})
uid = uid.a['href']
uid = uid.split('/')[1]
# Extract the maximum page number
pageSize = soup.find('div', attrs={'id': 'pagelist'})
pageSize = pageSize.find('div').getText()
pageSize = (pageSize.split('/')[1]).split('页')[0]

pageSize = int(pageSize)
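As a worked example of the page-count parsing above: on weibo.cn the pager text looks like '1/100页' ("page 1 of 100"; the numbers here are made up), so the two splits peel out the total:

text = '1/100页'              # hypothetical pager text; '页' means "page"
total = text.split('/')[1]    # '100页'
total = total.split('页')[0]  # '100'
print(int(total))             # 100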


# Extract the number of posts
divMessage = soup.find('div', attrs={'class': 'tip2'})
weiBoCount = divMessage.find('span').getText()
weiBoCount = (weiBoCount.split('[')[1]).replace(']', '')
# Extract the following count and the follower count
a = divMessage.find_all('a')[:2]
guanZhuCount = (a[0].getText().split('[')[1]).replace(']', '')
fenSiCount = (a[1].getText().split('[')[1]).replace(']', '')
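The counts on the tip2 bar are rendered like '微博[2548]', '关注[120]', '粉丝[98765]' (made-up numbers), which is why each one is split on '[' and stripped of ']':

label = '关注[120]'                             # hypothetical link text
count = (label.split('[')[1]).replace(']', '')
print(count)                                    # '120'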


# Loop over every page and scrape it

file = r'E:\test.txt'  # output file for the post text


for i in range(1, pageSize + 1):
    x = 1  # per-page image counter
    # Each page's URL has the form: url = 'https://weibo.cn/' + id + '?page=' + str(i)
    # url = 'https://weibo.cn/GUCCI?page=' + str(i)
    url = 'https://weibo.cn/u/1942745225?page=' + str(i)  # note: hard-codes a uid different from the profile scraped above
    browser.get(url)
    time.sleep(1)
    # Parse the page HTML with BeautifulSoup
    soup = BeautifulSoup(browser.page_source, 'lxml')
    body = soup.find('body')
    # Drop the first div (header) and the last two (pager/footer)
    divss = body.find_all('div', attrs={'class': 'c'})[1:-2]
    for divs in divss:
        # yuanChuang: 1 = original post, 0 = repost
        yuanChuang = '1'  # assume original; changed when the post turns out to be a repost
        div = divs.find_all('div')
        # Three layouts occur: two for original posts, one for reposts
        if (len(div) == 2):  # original post, with an image
            # Extract the post text
            content = div[0].find('span', attrs={'class': 'ctt'}).getText()

            # # Strip links from the text
            # pattern = re.compile(r'点击http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+ ')  # match pattern
            # texturl = re.findall(pattern, str(content))
            # content = re.sub(texturl[0], "", content)
            # print(content)

            # Find the image
            tupian = div[1].find('a')
            tupian = tupian.find('img')
            pic_url = re.findall('src="(.*?)"', str(tupian), re.S)
            print(pic_url)
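            # (Alternatively, BeautifulSoup exposes the attribute directly,
            #  which avoids the regex; a sketch: pic_url = [tupian.get('src')].)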
            # Save the image to disk; only record the text if the download succeeded
            try:
                if urllib.request.urlretrieve(pic_url[0], r'E:\image\%s.%s.jpg' % (i, x)):
                    # print("%s.%s" % (i, x))
                    # Append the post text to the output file
                    with open(file, 'a+', encoding='utf-8') as f:
                        f.write(str(i) + "." + str(x) + " " + content + "\n")
                    x += 1
            except Exception:
                pass  # skip images that fail to download

            # path = r"E:\image\img" + str(i) + ".jpg"
            # with open(path, "wb") as f:
            #     f.write(response.read())


            # aa = div[1].find_all('a')
            # for a in aa:
            #     text = a.getText()
            #     if (('赞' in text) or ('转发' in text) or ('评论' in text)):
            #         # number of likes
            #         if ('赞' in text):
            #             dianZan = (text.split('[')[1]).replace(']', '')
            #         # number of reposts
            #         elif ('转发' in text):
            #             zhuanFa = (text.split('[')[1]).replace(']', '')
            #         # number of comments
            #         elif ('评论' in text):
            #             pinLun = (text.split('[')[1]).replace(']', '')
            # # Extract the post time and source
            # span = divs.find('span', attrs={'class': 'ct'}).getText()
            # faBuTime = str(span.split('来自')[0])
            # laiYuan = span.split('来自')[1]


        # elif (len(div) == 1):  # original post, no image
        #     content = div[0].find('span', attrs={'class': 'ctt'}).getText()
        #     aa = div[0].find_all('a')
        #     for a in aa:
        #         text = a.getText()
        #         if (('赞' in text) or ('转发' in text) or ('评论' in text)):
        #             if ('赞' in text):
        #                 dianZan = (text.split('[')[1]).replace(']', '')
        #             elif ('转发' in text):
        #                 zhuanFa = (text.split('[')[1]).replace(']', '')
        #             elif ('评论' in text):
        #                 pinLun = (text.split('[')[1]).replace(']', '')
        #     span = divs.find('span', attrs={'class': 'ct'}).getText()
        #     faBuTime = str(span.split('来自')[0])
        #     laiYuan = span.split('来自')[1]
        # # A repost; otherwise the same as above
        # elif (len(div) == 3):  # reposted post
        #     yuanChuang = '0'
        #     content = div[0].find('span', attrs={'class': 'ctt'}).getText()
        #     aa = div[2].find_all('a')
        #     for a in aa:
        #         text = a.getText()
        #         if (('赞' in text) or ('转发' in text) or ('评论' in text)):
        #             if ('赞' in text):
        #                 dianZan = (text.split('[')[1]).replace(']', '')
        #             elif ('转发' in text):
        #                 zhuanFa = (text.split('[')[1]).replace(']', '')
        #             elif ('评论' in text):
        #                 pinLun = (text.split('[')[1]).replace(']', '')
        #     span = divs.find('span', attrs={'class': 'ct'}).getText()
        #     faBuTime = str(span.split('来自')[0])
        #     laiYuan = span.split('来自')[1]
    time.sleep(2)
    print(i)
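Two practical notes on the loop above. First, Weibo's image servers sometimes reject bare urllib downloads; installing a global opener with a browser-like User-Agent before the loop is one common workaround (a sketch; the header value is an assumption, not something the original post configures):

import urllib.request

# Make urlretrieve send a browser-like User-Agent on every request.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

Second, once the loop finishes, call browser.quit() so the Chrome process does not linger in the background.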
