头条新闻 Weibo Account Crawler

To keep it from being forgotten, and because the code a junior classmate wrote turned out to be quite handy, this post summarizes the crawler code for news posts from the 头条新闻 Weibo account. The code was tested and working as of 2021-01-12.

Preface

The 头条新闻 Weibo account is the news source (https://weibo.com/breakingnews).
The crawler collects the news this account posted over the past two years, including the publish time, the post title and content, images, links to news videos, and the first 15 pages of comments.
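For reference, judging from how parse_one_page in spider.py writes its output, each crawled post ends up in a .txt file roughly shaped like this (angle-bracket values are placeholders):

time:
<post date> <post time>
news:
<post text>
video:
<video link, only if the post contains one>
data:
shareNum:<n>reviewNum:<n>likeNum:<n>
review:
<comment 1>
<comment 2>
...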

Code Design

1. Code functionality

The code directory contains getcode.py and spider.py: getcode.py collects the event ID of each Weibo post, and spider.py fetches the corresponding post content, including images and video links.
For example, in the link https://weibo.com/1618051664/JCluY5bxf?filter=hot&root_comment_id=0&type=comment#_rnd1610195729508,

JCluY5bxf is the event ID.
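As a minimal sketch, the ID can be pulled out of such a link with the same regular expression used in getcode.py below:

import re

# Example link from this post; getcode.py applies the same pattern to the hrefs it finds on the page.
href = ("https://weibo.com/1618051664/JCluY5bxf"
        "?filter=hot&root_comment_id=0&type=comment#_rnd1610195729508")

match = re.search(r"\d/\w.*\?", href)
if match:
    event_id = match.group(0)[2:-1]  # drop the leading "digit/" and the trailing "?"
    print(event_id)                  # JCluY5bxf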

2. File storage

The crawled event IDs are stored under the EventIDNums directory as year/monthcode.txt, e.g. 2019/12code.txt.

The crawled posts are stored under the blogs directory as year/month/ID.txt, and the images are stored under year/month/ID/.
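A minimal sketch of how those paths fit together (the actual strings are concatenated inside getcode.py and spider.py):

import os

year, month, event_id = "2019", 12, "JCluY5bxf"

code_file = os.path.join("EventIDNums", year, str(month) + "code.txt")  # EventIDNums/2019/12code.txt
blog_file = os.path.join("blogs", year, str(month), event_id + ".txt")  # blogs/2019/12/JCluY5bxf.txt
image_dir = os.path.join("blogs", year, str(month), event_id)           # blogs/2019/12/JCluY5bxf
print(code_file, blog_file, image_dir)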

3. Preparation before running

1. Set up the environment: make sure selenium, a chromedriver matching your Chrome version, requests, bs4, lxml and the other parsing libraries are installed (look up the installation steps yourself); a quick import check follows this list.

2. Edit the placeholders in the scripts (the year, account and password in getcode.py; the User-agent and Cookie in spider.py).

3. Run the commands from the SPIDER folder, e.g. python3 code/getcode.py
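A minimal sketch, not part of the original repo, that verifies the Python dependencies are importable before a run (chromedriver still has to match your installed Chrome version separately):

import selenium, requests, bs4
from lxml import etree

print("selenium", selenium.__version__)
print("requests", requests.__version__)
print("beautifulsoup4", bs4.__version__)
print("lxml", ".".join(map(str, etree.LXML_VERSION)))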

4. Other notes

1. The 头条新闻 years for which IDs have already been crawled are 2018, 2019 and 2020.

2. spider.py has been tested and can be dropped straight onto a server to run; so far no account or IP bans have been encountered. getcode.py requires clicking through a login, so it can only be operated in a graphical environment; logging in with an SMS verification code sent to your phone is recommended, and each account gets 3 verification attempts per day. While getcode.py is crawling, avoid clicking inside the automated browser window. In testing, the code itself rarely hits major problems; failures are usually caused by network fluctuations, so run the code on a stable network, and if a run terminates, simply start it again.

3. You need to manually create the corresponding year folders under the EventIDNums directory (a small helper sketch follows this list).
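A minimal helper sketch for that manual step (my addition, not part of the original scripts); it also pre-creates the blogs directory, which spider.py assumes exists:

import os

# Adjust the year list to whatever you plan to crawl.
for year in ("2018", "2019", "2020"):
    os.makedirs(os.path.join("EventIDNums", year), exist_ok=True)
os.makedirs("blogs", exist_ok=True)  # spider.py creates blogs/<year>/<month> but not blogs/ itself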

5. Run commands (from the SPIDER directory)

python3 code/getcode.py
nohup python3 code/spider.py >> test.log 2>&1 &  (it is best to delete test.log before each run)

Code

getcode.py

#author:timevshow
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import re
import os
year = '2020'  # change the year here
loginUrl = 'https://passport.weibo.cn/signin/login'
baseUrl = 'https://www.weibo.com/breakingnews?is_all=1&stat_date='+year
browser = webdriver.Chrome()
browser.implicitly_wait(5)
wait = WebDriverWait(browser,10)
scriptT = ""
scriptF = "window.scrollTo(document.body.scrollHeight,0)"
filePath = "EventIDNums/"+year
breakpointfile = "breakpoint.txt"  # file recording the crawl breakpoint (month and page)
breakmonth = 0
breakpages = 0

def windowScroll():  # scroll the page so all posts on it are loaded
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(6)
    browser.execute_script("window.scrollTo(document.body.scrollHeight,2*document.body.scrollHeight)")
    time.sleep(6)
    browser.execute_script("window.scrollTo(document.body.scrollHeight,0)")
    time.sleep(6)

def login():  # log in with an SMS verification code; each account gets 3 verification attempts per day, reset after 24 hours
    browser.get(loginUrl)
    browser.find_element_by_id('loginName').send_keys('')  # replace with your own Weibo account
    browser.find_element_by_id('loginPassword').send_keys('')  # replace with your own password
    time.sleep(1)
    browser.find_element_by_id('loginAction').click()
    time.sleep(60)    
    print("login successfully!!!")

def getIDNums():
    for month in range(breakmonth,0,-1):  # month by month, counting down from the breakpoint
        num = 1
        if(month < breakmonth):
            targetpages = 1  
        else:
            targetpages = breakpages
        # build the month URL (zero-pad months below 10)
        if(month >= 10):
            Url = baseUrl + str(month)
        else:
            Url = baseUrl + "0" + str(month)
        for page in range(targetpages,41):
            url = Url + "&page=" + str(page)
            browser.get(url)
            time.sleep(5)
            print("页面加载完毕,开始滚动页面")
            windowScroll()
            print("开始爬取第"+str(page)+"页")
            #存断点文件
            with open(breakpointfile,'w') as bf:
                bf.write(str(month)+" "+str(page))
            articles = browser.find_elements_by_class_name('WB_feed_handle')
            # start reading the posts on the page
            for article in articles:
                details = article.find_elements_by_css_selector('.pos span')
                shareNum = details[2].text[1:]
                likeNum = details[6].text[1:]
                review = details[4]
                reviewNum = details[4].text[1:]
                review.click()
                time.sleep(1)
                hrefs = browser.find_elements_by_css_selector('.list_ul a')
                if(len(hrefs) == 0):
                    continue
                href = hrefs[len(hrefs) - 1].get_attribute("href")
                if(href is None):
                    continue
                code = re.search(r"\d/\w.*\?",href)
                if(code is None):
                    continue
                # extract this post's event ID
                code = code.group(0)[2:-1]
                review.click()
                # append to this month's code file
                with open(filePath+"/"+str(month)+"code.txt",'a') as file:
                    file.write(str(num)+" "+str(code)+" "+str(reviewNum)+" "+str(shareNum)+" "+str(likeNum)+"\n")
                num = num + 1

def getbreakpoint():  # read the breakpoint info (resume month and page)
    if(os.path.exists(breakpointfile)):
        with open(breakpointfile,'r') as bf:
            data = bf.readline()
            breakmonth,breakpages = data.split()
            breakmonth = int(breakmonth)
            breakpages = int(breakpages)
            if(breakpages < 40):
                breakpages = breakpages + 1
            else:
                breakmonth = breakmonth - 1       
                breakpages = 1
    else:
        breakmonth = 12
        breakpages = 1
    return breakmonth,breakpages

if __name__ == "__main__":
    if(not os.path.exists(filePath)):
        os.mkdir(filePath)
    breakmonth,breakpages = getbreakpoint()
    login()  # you must be logged in before querying by month
    getIDNums()  # crawl the event IDs

spider.py

#author:timevshow
import requests
from bs4 import BeautifulSoup 
import re
import time
import random
import os
flag = 1 
reviewNum = 0
likeNum = 0
shareNum = 0
src = ''  # path where the crawled files are stored
# Replace User-agent and Cookie below with your own values.
headers = {
    'User-agent' : '',
    'Host' : 'weibo.cn',
    'Accept' : 'application/json, text/plain, */*',
    'Accept-Language' : 'zh-CN,zh;q=0.9',
    'Accept-Encoding' : 'gzip, deflate, br',
    'Cookie':'',
    'DNT' : '1',
    'Connection' : 'keep-alive'
}  # request headers; fill in your own User-agent and Cookie



def get_one_page(url):  # request helper: fetch the full content of one page
    response = requests.get(url,headers = headers,verify=False)  # fetch the page HTML with requests.get
    if response.status_code == 200:  # status 200 means the request succeeded
        return response.text  # return the HTML document, to be passed to the parser
    return None

def getNewsPicture(id):  # download all images of the post into the matching year/month directory
    print("Start downloading post images")
    picture = requests.get("https://weibo.cn/mblog/picAll/"+id+"?rl=1",headers = headers)
    soup = BeautifulSoup(picture.text,'lxml')
    pictures = soup.select('img')
    if(pictures is not None):
        if not os.path.exists(src+str(id)):
            os.mkdir(src+str(id))
        filePath = src+str(id)+"/"
        for i in range(0,len(pictures)):
            response = requests.get(pictures[i].attrs['src'])
            if(response.status_code == 200):
                with open(filePath+str(i)+".jpg",'wb') as file:
                    file.write(response.content)
        print("爬取图片完毕")

def getPages(id):
    url = "https://weibo.cn/1618051664/"+id+"?page=1"#生成访问的对应url
    html = get_one_page(url)
    print('Crawling page 1 of comments')
    parse_one_page(html,src+id+".txt")
    print("Parsing done")
    soup = BeautifulSoup(requests.get(url,headers = headers).text,'lxml')
    txt = soup.find(class_="pa")
    if(txt is None):
        return 0
    else:
        txt = txt.text.split()[-1:][0]
    pages = int(txt.split('/')[1][:-1])
    time.sleep(2)
    return pages
        

def parse_one_page(html,filePath):  # parse the HTML and append the result to the given file
    file = open(filePath,'a+')
    soup = BeautifulSoup(html,'lxml')
    news = soup.find(id="M_")
    flag = 0
    if(news is not None):
        newC = news.text.split()
        links = news.select('a')
        content = ''
        for i in range(1,len(newC) - 6):
            content += newC[i]
        content = content[1:]
        file.write("time:\n"+newC[-6]+" "+newC[-5]+"\n")
        file.write("news:\n"+content+"\n")
        if(links is not None):
            for eachone in links:
                if(re.match('.*/video/.*',eachone.attrs['href'])):
                    print(eachone.attrs['href'])
                    file.write("video:\n"+eachone.attrs['href']+"\n")
                    break
        file.write("data:\n"+"shareNum:"+str(shareNum)+"reviewNum:"+str(reviewNum)+"likeNum:"+str(likeNum)+"\n")
        flag = 1
    if(flag == 1):
        file.write("review:\n")
    review = soup.find_all(class_="ctt")[flag:]
    for eachone in review:
        file.write(eachone.text+"\n")
    file.close()

if __name__ == '__main__':
    flag = 0
    codedir = 'EventIDNums'
    blogdir = 'blogs/'
    year = os.listdir(codedir)
    for eachy in year:
        list = os.listdir(codedir+"/"+eachy)
        for eachym in list:
            m = int(eachym.replace('code.txt',''))
            if not os.path.exists(blogdir + str(eachy)):
                os.mkdir(blogdir + str(eachy))
            src = blogdir + str(eachy)+"/"+str(m)
            if not os.path.exists(src):
                os.mkdir(src)
            src = src + "/"
            print(src)
            if(m == 5 or m == 6 or m == 7 or m == 9 or m == 12):  # skip these months (hard-coded; adjust as needed)
                continue
            lines = open(codedir+"/"+eachy+"/"+eachym, 'r').readlines()
            for j in range(0,len(lines)):
                print("第"+str(m)+"月"+str(j)+">>>>>>>>"+str(len(lines)))
                index, id, reviewNum ,shareNum ,likeNum= lines[j].split()
                if(os.path.exists(src+id+".txt")):
                    print("已存在")
                    continue
                content = ''
                getNewsPicture(id)
                pages = min(15,getPages(id))  # number of comment pages, capped at 15
                flag = 1
                for i in range(2,pages + 1):  # remaining comment pages
                    url = "https://weibo.cn/1618051664/"+id+"?page="+str(i)
                    html = get_one_page(url)
                    print('Crawling page %d of comments' % (i))
                    parse_one_page(html,src+id+".txt")
                    print("Parsing done")
                    time.sleep(3)

Summary

That's all the code for crawling the 头条新闻 Weibo account.
