Crawling Weibo

1. First, wrap the slider-captcha cracking code into a module of its own

sliderCode.py

from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains

# Helper 1: screenshot the captcha area
def shot_img(driver):
    sleep(2)
    # Screenshot the whole browser window
    driver.save_screenshot("./page.png")
    # Load the screenshot
    img = Image.open("./page.png")
    # Crop the captcha area out of the full-page screenshot
    loc = driver.find_element_by_class_name("geetest_slicebg").location # position of the captcha area
    size = driver.find_element_by_class_name("geetest_slicebg").size # size of the captcha area
    print(loc,size)
    # Compute the bounding box of the captcha area
    top = loc["y"]
    left = loc["x"]
    right = loc["x"] + size["width"]
    bottom = loc["y"] + size["height"]

    # Crop the image according to the bounding box
    code_img = img.crop((left*2,top*2,right*2,bottom*2)) # experience shows multiplying by 2 works here (the screenshot is scaled 2x)
    # code_img.show()
    return code_img
# Helper 2: compute the distance from the start to the gap
def get_distance(img1,img2):
    # Compare the image with the gap against the one without it pixel by pixel;
    # the x position of the first differing pixel is the distance from the start to the gap
    for i in range(50,img1.size[0]): # skip the first 50 px, where the slider piece itself sits
        for j in range(img1.size[1]):
            # Load the RGB values
            rgb1 = img1.load()[i,j]
            rgb2 = img2.load()[i,j]
            # Compute the per-channel difference between the two images
            r = abs(rgb1[0] - rgb2[0])
            g = abs(rgb1[1] - rgb2[1])
            b = abs(rgb1[2] - rgb2[2])
            # If r, g and b all differ by more than 60, treat this as the gap position
            if r>60 and g>60 and b>60:
                return i/2 - 6 # divide by 2 to undo the 2x screenshot scaling, minus a small offset
# Helper 3: generate a movement track for the slider
def get_tracks(distance):
    # Overshoot the target by 20 px, then move back afterwards
    distance += 20
    v = 0
    t = 0.2
    # List of forward moves (the distance covered by each step)
    forwards = []
    # current position
    current = 0
    # midpoint: accelerate before it, decelerate after it
    mid = distance*3/5
    while current < distance:
        if current < mid:
            a = 2
        else:
            a = -3
        s = v*t + 0.5*a*(t**2)
        v = a*t + v
        current += s
        forwards.append(round(s))

    return {"forwards":forwards,"backs":[-3,-3,-2,-2,-3,-2,-2,-1,-1,-1]}




# Crack the slider captcha
def crack_code(driver):
    # 1. Compute the sliding distance
    # 1) Screenshot with the gap
    img1 = shot_img(driver)
    # 2) Hide the gap
    # Run a JS snippet that shows the full background
    js = "document.querySelector('.geetest_canvas_slice').style.display='block';document.querySelector('.geetest_canvas_slice').style.zIndex=10;document.querySelector('.geetest_canvas_fullbg').style.display='block';"
    driver.execute_script(js)

    # 3) Screenshot without the gap
    img2 = shot_img(driver)
    # 4) Both screenshots are taken; restore the page to its original state
    js = "document.querySelector('.geetest_canvas_slice').style.display='block';document.querySelector('.geetest_canvas_slice').style.zIndex=10;document.querySelector('.geetest_canvas_fullbg').style.display='none';"
    driver.execute_script(js)
    # 5) Compute the distance from the start to the gap using the two screenshots
    distance = get_distance(img1,img2)
    print(distance)

    # 2. Drag the slider with human-like movement
    # Use an ActionChains object to simulate the gestures

    btn = driver.find_element_by_class_name("geetest_slider_button")
    # Press and hold the button
    ActionChains(driver).click_and_hold(btn).perform()
    # Drag along the generated track
    # forward moves
    tracks = get_tracks(distance)

    for track in tracks["forwards"]:
        ActionChains(driver).move_by_offset(yoffset=0,xoffset=track).perform()
    sleep(0.5)
    # backward moves, to undo the overshoot
    for track in tracks["backs"]:
        ActionChains(driver).move_by_offset(yoffset=0,xoffset=track).perform()
    sleep(0.5)
    # Release the button
    ActionChains(driver).release().perform()

# Login function (used here to test the module against the cnblogs sign-in page)
def login_blogs(name,password):
    # Login page URL
    login_page = "https://account.cnblogs.com/signin"
    driver = webdriver.Chrome(executable_path=r"C:\Users\fanjianbo\Desktop\chromedriver_win32\chromedriver.exe")
    try:
        driver.get(login_page)
        sleep(1)
        # Locate the form fields and fill in the username and password
        driver.find_element_by_id("LoginName").send_keys(name)
        driver.find_element_by_id("Password").send_keys(password)
        # Click the login button
        driver.find_element_by_class_name("ladda-label").click()

        # Clicking login pops up the slider captcha; crack it
        crack_code(driver)

        sleep(5)

    finally:
        driver.close()


if __name__ == '__main__':
    login_blogs("qwer","qwer")

2. Then wrap a function for logging in

login.py

import sliderCode
from lxml import etree
from time import sleep
from selenium import webdriver

# Login function
def login_weibo(url,name,password):
    driver = webdriver.Chrome(executable_path=r"C:\Users\fanjianbo\Desktop\chromedriver_win32\chromedriver.exe")
    driver.get(url)
    sleep(1)
    driver.find_element_by_id("loginName").send_keys(name)
    driver.find_element_by_id("loginPassword").send_keys(password)
    driver.find_element_by_id("loginAction").click()
    sleep(5)
    try:
        driver.find_element_by_class_name("geetest_radar_tip").click()
        sleep(0.5)
        # Check whether the captcha is a slider captcha
        tree = etree.HTML(driver.page_source)
        slice = tree.xpath("//canvas[starts-with(@class,'geetest_canvas_slice')]")
        if len(slice) == 0:
            # Not a slider captcha
            sleep(10) # pause so the captcha can be completed manually
        else:
            # Slider captcha
            sliderCode.crack_code(driver)
    except Exception as e:
        print(e)
        print("Already logged in, no further verification needed!")

    # Get the page cookies
    cookies = driver.get_cookies()
    driver.quit()
    # print(cookies)
    # Join the cookies into a single header string
    cookie_list = []
    for cookie in cookies:
        cookie_list.append(str(cookie["name"]) + "=" + str(cookie["value"]))
    return ";".join(cookie_list)

if __name__ == '__main__':
    # Login page URL
    login_page_url = "https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt="
    # Log in
    cookies = login_weibo(login_page_url,"18610593606","a1234567890")
    # print(cookies)
    with open("cookies.txt","w") as fp:
        fp.write(cookies)
3. Crawl Weibo

weiboSpider.py

import requests
from lxml import etree
from time import sleep
import re
# 1. Fetching module
def fetch_pages(url,cookies):
    # Request headers
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',"cookie":cookies}
    # Request the first page
    first_page = requests.get(url=url,headers=headers)
    print(first_page.text)

    # Strip the XML declaration so lxml can parse the page, then find the link to the next page
    start = len('<?xml version="1.0" encoding="UTF-8"?>')
    print(first_page.text[start:])
    first_page_tree = etree.HTML(first_page.text[start:])
    sleep(1)
    yield first_page.text[start:]

    next_url = "https://weibo.cn" + first_page_tree.xpath("//div[@id='pagelist']//a[1]/@href")[0]
    next_page = requests.get(url=next_url,headers=headers)
    next_tree = etree.HTML(next_page.text[start:])
    yield next_page.text[start:]
    for i in range(6):
        next_url = "https://weibo.cn" + next_tree.xpath("//div[@id='pagelist']//a[1]/@href")[0]
        next_page = requests.get(url=next_url, headers=headers)
        next_tree = etree.HTML(next_page.text[start:])
        sleep(1)
        # print(next_page)
        yield next_page.text[start:]

# 2. Parsing module
def analysis_pages(page_list):
    for page in page_list:
        # print(page)
        page_tree = etree.HTML(page)
        # Get all weibo posts on the page
        weibo_list = page_tree.xpath("//div[@class='c' and @id]")
        # Iterate over the posts and parse them as one of four types:
        # original without image (author, content, likes, reposts, comments),
        # original with image (plus image URL),
        # repost without image (plus repost reason),
        # repost with image (plus image URL and repost reason)
        for weibo in weibo_list:
            item = {}
            # Distinguish the type by the number of child divs
            div_list = weibo.xpath("./div")
            num = len(div_list)
            if num == 1:
                # Original, no image
                item["flag"] = "YN"
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+",string=weibo.xpath(".//a/text()")[-4])[0]
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//a/text()")[-3])[0]
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//a/text()")[-2])[0]
            elif num==2:
                # Two possible cases: original with image, or repost without image
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-4])[0]
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-3])[0]
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[2]/a/text()")[-2])[0]
                # Look for an image
                src = weibo.xpath(".//img[@class='ib']/@src")
                if len(src) == 0:
                    # Repost, no image
                    item["flag"] = "ZN"
                    item["liyou"] = weibo.xpath(".//div[2]//text()")[1]
                else:
                    # Original, with image
                    item["flag"] = "YP"
                    item["pic"] = src[0]

            else:
                # Repost, with image
                item["flag"] = "ZP"
                item["author"] = weibo.xpath(".//a[@class='nk']/text()")[0]
                item["content"] = "".join(weibo.xpath(".//span[@class='ctt']//text()"))
                item["dianzan"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-4])[0]
                item["zhuanfa"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-3])[0]
                item["pinglun"] = re.findall(pattern=r"[0-9]+", string=weibo.xpath(".//div[3]/a/text()")[-2])[0]
                item["liyou"] = weibo.xpath(".//div[3]//text()")[1]
                item["pic"] = weibo.xpath(".//img[@class='ib']/@src")[0]
            print(item)




if __name__ == '__main__':
    # Load the cached cookies from the local file
    with open("./cookies.txt","r") as fp:
        cookies = fp.read()
    print(cookies)
    url = "https://weibo.cn/"
    # Fetch the pages and parse them
    page_list = fetch_pages(url=url,cookies=cookies)
    analysis_pages(page_list)

Of course, the parsed results can finally be stored in a CSV file.
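A minimal sketch of that last step, assuming analysis_pages is changed to collect its item dictionaries into a list and return it (the function name save_to_csv and the output path weibo.csv are hypothetical, not part of the original code):

import csv

def save_to_csv(items, path="./weibo.csv"):
    # Field names match the item keys used in analysis_pages; keys missing from an item are written as empty strings
    fieldnames = ["flag", "author", "content", "dianzan", "zhuanfa", "pinglun", "pic", "liyou"]
    with open(path, "w", newline="", encoding="utf-8-sig") as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames, restval="")
        writer.writeheader()
        for item in items:
            writer.writerow(item)

The utf-8-sig encoding writes a BOM so that the Chinese text displays correctly when the file is opened in Excel.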
