selenium-爬取公众号0.2

过程3

之前写的版本基本完成了90%的功能,但剩下的10%也比较难搞,主要问题是对接口请求次数太多时账号会被禁用。

demo:

# *coding:utf-8 *.
import random
import time
import json
import pymysql
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from datetime import datetime, timedelta



# Launch Chrome via a hard-coded chromedriver path and build the driver object.
driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
# get() blocks until the page has fully loaded before the script continues.
time.sleep(2)
driver.get('https://mp.weixin.qq.com/cgi-bin/home?t=home/index&lang=zh_CN&token=1280197235')
# NOTE(review): the token in the URL is session-bound and expires — confirm it
# is refreshed before each run.
href1 = driver.find_element_by_id('jumpUrl')
ActionChains(driver).click(href1).perform()
time.sleep(20)
# "Image & text message" tile on the MP admin home page.
href2 = driver.find_element_by_css_selector('#app > div.main_bd_new > div:nth-child(4) > div.weui-desktop-panel__bd > div > div:nth-child(1)')

ActionChains(driver).click(href2).perform()
# The editor opens in a new tab; switch to the newest window handle.
driver.switch_to.window(driver.window_handles[-1])
time.sleep(2)
# "Insert link" button inside the article editor.
btn1 = driver.find_element_by_id('js_editor_insertlink')
ActionChains(driver).click(btn1).perform()
time.sleep(1)
# btn2 = driver.find_element_by_css_selector('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/p/div/button')
# ActionChains(driver).click(btn2).perform()
# time.sleep(1)


#暂停进程



#封装数据库写入
def insert_schools_article(title, herf, school_name, date):
    """Insert one scraped article row into schools_wenku.schools_article.

    Parameters:
        title: article title as scraped from the page.
        herf: article URL (parameter name kept as-is for caller compatibility).
        school_name: school the official account belongs to.
        date: publication date string as scraped.
    """
    db = pymysql.connect(host="localhost", user="root", password="", database="schools_wenku")
    try:
        cursor = db.cursor()
        # Parameterized query: the driver escapes quotes itself, which fixes
        # the SQL-injection/quoting problem the old .format() string had.
        # The old `title.replace("'", ' ')` hack is gone, so titles with
        # apostrophes are now stored verbatim.
        sql = ("insert into schools_article"
               "(account_title, official_account_url, school_name, date_issued, collection) "
               "values(%s, %s, %s, %s, 0)")
        print(sql)
        cursor.execute(sql, (title, herf, school_name, date))
        db.commit()
        cursor.close()
    finally:
        # Close the connection even if the insert fails (old code leaked it).
        db.close()


    # cursor.execute("select * from schools_article")
    # data = cursor.fetchall()
#封装爬取第一页
def first_page(title_list, school_name):
    """Scrape title/date/href from each article element on the current results
    page and persist each row via insert_schools_article.

    Parameters:
        title_list: selenium WebElements, one per article row
                    (class 'inner_link_article_item').
        school_name: school name stored alongside each article.
    """
    # The enumerate() index was never used; iterate the elements directly.
    for item in title_list:
        # The visible title text lives in the second <span> of the container.
        spans = item.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
        title = spans[1].get_attribute('textContent')
        date = item.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
        href = item.find_element_by_tag_name('a').get_attribute('href')
        print("title:", title)
        print('date:', date)
        print("href:", href)
        insert_schools_article(title, href, school_name=school_name, date=date)
#封装爬取多页
def more_page(school_name):
    """Page through the remaining result pages for one official account,
    scraping every page after the first.

    Relies on the module-level `driver`. Sleeps at random intervals to lower
    the chance of the account being throttled/banned.

    Parameters:
        school_name: school name stored alongside each article.
    """
    try:
        # Total page count from the pagination widget.
        pages = int(driver.find_element_by_xpath(
            '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]').get_attribute(
            'textContent'))
        print(pages)
    except Exception:
        # Pagination widget missing (single page of results, or the page did
        # not load): nothing more to scrape. The old code swallowed this and
        # fell into the loop with `pages` undefined, dying with a NameError.
        return
    try:
        clicks_since_pause = 0
        pause_after = random.randint(13, 18)
        for x in range(pages - 1):
            clicks_since_pause += 1
            if clicks_since_pause == pause_after:
                # Long random cool-down every 13-18 clicks to look human.
                time.sleep(random.randint(1200, 2400))
                clicks_since_pause = 0
            # On page 1 only a single "next" anchor exists; from page 2 on,
            # a[1] is "previous" and a[2] is "next".
            if x > 0:
                xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'
            else:
                xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
            # Current page number label (identical lookup was duplicated in
            # both branches before; hoisted out).
            pages_before = int(driver.find_element_by_xpath(
                '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[1]').get_attribute(
                'textContent'))
            if (pages_before % 15) == 0:
                # Extra cool-down every 15 pages.
                time.sleep(random.randint(1200, 2400))
            print(pages_before)
            driver.find_element_by_xpath(xpth_txt).click()
            time.sleep(random.randint(5, 10))
            title_list = driver.find_elements_by_class_name('inner_link_article_item')
            # Same per-article scrape as page 1; reuse the helper instead of
            # duplicating the loop body.
            first_page(title_list, school_name)
    except Exception as e:
        print(e)

        # PRINT = i + 1
        # print(page_num)
        # if int(page_num) % 25 == 0:
        #     time.sleep(600)
# Drive the "insert link" dialog once per school listed in the JSON file:
# search the official account by name, open it, then scrape all its pages.
with open('beishangguangshen_schoollist_01.json','r',encoding='utf8')as fp:
    json_data = json.loads(fp.read())
    print(json_data)
    for fp_list in json_data:
        school_name = fp_list['school_name']
        wechat_official_account_name = fp_list['wechat_official_account_name']
        print(school_name)
        print(wechat_official_account_name)
        # "Select other official account" button inside the link dialog.
        butten_other = driver.find_element_by_css_selector('#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > p > div > button')
        ActionChains(driver).click(butten_other).perform()
        # Type the account name into the search box, then submit with ENTER.
        driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input').send_keys(wechat_official_account_name)
        time.sleep(random.randint(5,10))
    # driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/span/button').click()
        driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input').send_keys(Keys.ENTER)
        time.sleep(random.randint(5,10))
        # Click the first search result — assumes it is the wanted account;
        # TODO confirm (no name check is done here).
        driver.find_element_by_css_selector('#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > div > div.weui-desktop-search__panel > ul > li:nth-child(1) > div.weui-desktop-vm_primary').click()
        time.sleep(random.randint(5,10))
        title_list = driver.find_elements_by_class_name('inner_link_article_item')
    # pages = int(driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]').get_attribute('textContent'))
        time.sleep(random.randint(15,30))
        # Scrape page 1, then page through the rest.
        first_page(title_list,school_name)

        more_page(school_name)



# def first_page():
#     for i, temp in enumerate(title_list):
#         spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
#         title = spans[1].get_attribute('textContent')
#         date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
#         href = temp.find_element_by_tag_name('a').get_attribute('href')
#         print("title:", title)
#         print('date:', date)
#         print("href:", href)
#
# def more_page():
#     for x in range(pages-1):
#         if x > 0:
#             xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'
#         else:
#             xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
#         driver.find_element_by_xpath(xpth_txt).click()
#
#         time.sleep(1,10)
#         title_list = driver.find_elements_by_class_name('inner_link_article_item')
#         for i, temp in enumerate(title_list):
#             spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
#             title = spans[1].get_attribute('textContent')
#             date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
#             href = temp.find_element_by_tag_name('a').get_attribute('href')
#             print("title:", title)
#             print('date:', date)
#             print("href:", href)


过程4

这一版基本实现了翻页和对不同学校公众号的情况进行处理的功能,也对封禁情况进行了处理,不过写得实在是不堪入目,主要是很多循环代码都是重复劳作,加上 xpath/css_selector 实在是太太太太长了。于是对上面的一些循环和执行步骤进行了封装,再把比较长的 xpath/css_selector 打包作为常量存起来再引用,外加日志功能,报错的话看一下情况。您再看,哇!优雅!

# *coding:utf-8 *.
import random
import time
import json
import pymysql
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import os
import constant

from datetime import datetime


# 封装数据库写入
def insert_schools_article(title, herf, school_name, date):
    """Insert one scraped article row into shaodong.t_shaodong2.

    Parameters:
        title: article title as scraped from the page.
        herf: article URL (parameter name kept as-is for caller compatibility).
        school_name: school the official account belongs to.
        date: publication date string as scraped.
    """
    db = pymysql.connect(host="localhost", user="root", password="123456", database="shaodong")
    try:
        cursor = db.cursor()
        # Parameterized query: the driver escapes quotes itself, which fixes
        # the SQL-injection/quoting problem of the old .format() string and
        # removes the need for the `title.replace("'", ' ')` hack, so titles
        # with apostrophes are stored verbatim.
        sql = ("insert into t_shaodong2"
               "(account_title, official_account_url, school_name, date_issued, collection) "
               "values(%s, %s, %s, %s, 0)")
        cursor.execute(sql, (title, herf, school_name, date))
        db.commit()
        cursor.close()
    finally:
        # Close the connection even if the insert fails (old code leaked it).
        db.close()


# 获取本页数据 内容
# Scrape the article rows of the current results page.
def obtain_content(title_list, school_name):
    """Extract title/date/href from each article element and persist each row
    via insert_schools_article.

    Parameters:
        title_list: selenium WebElements, one per article row
                    (class 'inner_link_article_item').
        school_name: school name stored alongside each article.
    """
    for item in title_list:
        # Loop variable renamed from `title` — the old code reused that name
        # for both the WebElement and the title string, shadowing it mid-loop.
        spans = item.find_element(By.CLASS_NAME, 'inner_link_article_title').find_elements(By.TAG_NAME, 'span')
        date = item.find_element(By.CLASS_NAME, 'inner_link_article_date').get_attribute('textContent')
        href = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        # The visible title text lives in the second <span> of the container.
        title = spans[1].get_attribute('textContent')
        print("title:", title)
        print('date:', date)
        print("href:", href)
        insert_schools_article(title, href, school_name=school_name, date=date)


def more_page(school_name):
    """Scrape every result page (including the current first page) for one
    official account, inserting each article row via obtain_content.

    Relies on the module-level `driver`, `logs` and the `constant` module.

    Parameters:
        school_name: school name stored alongside each article.
    """
    try:
        # Total page count from the pagination widget.
        pages = int(driver.find_element(By.XPATH, constant.XPATH04).get_attribute('textContent'))
    except Exception as e:
        # Widget not found — most likely the account got throttled. Wait a
        # day, re-submit the search, then read the count again. The old code
        # re-read the count unconditionally after the try/except, i.e. a
        # redundant second DOM query even on success.
        print(e)
        time.sleep(86400)
        driver.find_element(By.XPATH, constant.XPATH02).send_keys(Keys.ENTER)
        time.sleep(random.randint(5, 10))
        pages = int(driver.find_element(By.XPATH, constant.XPATH04).get_attribute('textContent'))
    for page_num in range(pages):
        time.sleep(random.randint(3, 5))
        # Grab the article rows of the current page.
        title_list = driver.find_elements(By.CLASS_NAME, 'inner_link_article_item')
        if len(title_list) == 0:
            # Empty list usually means a throttle/ban; retry once after 10s.
            time.sleep(10)
            print("尝试停10秒看能不能行", '当前时间:', datetime.now())
            title_list = driver.find_elements(By.CLASS_NAME, 'inner_link_article_item')
            print("应该是被封了,等待一天", '当前时间:', datetime.now())
            # Dump captured performance-log network messages for diagnosis.
            # NOTE(review): `logs` is fetched once at startup, so this only
            # shows early traffic — consider re-fetching here. TODO confirm.
            for item in logs:
                log = json.loads(item["message"])["message"]
                if "Network.response" in log["method"]:
                    print(log)
            time.sleep(86400)
        else:
            obtain_content(title_list, school_name)

        # Which "next page" anchor exists depends on the current page: page 1
        # shows a single anchor, later pages show prev + next.
        page_num_tag = driver.find_elements(By.CLASS_NAME, 'weui-desktop-pagination__num')[0].get_attribute('textContent')
        if int(page_num_tag) == 1:
            xpth_txt = constant.XPATH05
        else:
            xpth_txt = constant.XPATH06
        if page_num < pages - 1:
            driver.find_element(By.XPATH, xpth_txt).click()


def iter_file(path, file_format):
    """Recursively collect file paths under *path* whose names end with
    *file_format* (e.g. 'json').

    Bug fix: the old code re-assigned the result list on every os.walk
    iteration, so only matches from the LAST visited directory were returned.
    Matches are now accumulated across all directories.

    Parameters:
        path: root directory to walk.
        file_format: filename suffix to match (compared with str.endswith).

    Returns:
        list[str] of full paths, in os.walk order.
    """
    files = []
    for dirpath, dirnames, filenames in os.walk(path):
        files.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(file_format)
        )
    return files


if __name__ == '__main__':

    # Launch Chrome via a hard-coded chromedriver path, with performance
    # logging enabled (see constant.CAPS) so network traffic can be inspected.
    driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=driver_path, desired_capabilities=constant.CAPS)
    # get() blocks until the page has fully loaded before the script continues.
    # NOTE(review): performance logs are fetched only once, here at startup —
    # later reads of `logs` (inside more_page) will not see new traffic; confirm.
    logs = driver.get_log("performance")
    time.sleep(2)
    driver.get(constant.URL)
    href1 = driver.find_element(By.ID, 'jumpUrl')
    ActionChains(driver).click(href1).perform()
    time.sleep(10)
    # "Image & text message" tile on the MP admin home page.
    href2 = driver.find_element(By.CSS_SELECTOR, constant.CSS01)
    ActionChains(driver).click(href2).perform()
    # The editor opens in a new tab; switch to the newest window handle.
    driver.switch_to.window(driver.window_handles[-1])
    time.sleep(2)
    # "Insert link" button inside the article editor.
    btn1 = driver.find_element(By.ID, 'js_editor_insertlink')
    ActionChains(driver).click(btn1).perform()
    time.sleep(1)

    # One JSON file per batch of schools; each entry carries the school name
    # and its WeChat official-account name.
    json_list = iter_file(r'C:\Users\CH\Desktop\school', 'json')
    for json_file in json_list:
        with open(json_file, 'r', encoding='utf8')as fp:
            json_data = json.loads(fp.read())
            # print(json_data)
            for fp_list in json_data:
                school_name = fp_list['school_name']
                wechat_official_account_name = fp_list['wechat_official_account_name']
                print(school_name)
                print(wechat_official_account_name)
                # Open the "select other official account" search box and
                # submit the account name with ENTER.
                button_other = driver.find_element(By.CSS_SELECTOR, constant.CSS02)
                ActionChains(driver).click(button_other).perform()
                driver.find_element(By.XPATH, constant.XPATH01).send_keys(wechat_official_account_name)
                time.sleep(random.randint(5, 10))
                driver.find_element(By.XPATH, constant.XPATH02).send_keys(Keys.ENTER)
                time.sleep(random.randint(5, 10))
                # Collect the <li> entries of the search-results <ul>.
                li_tag = driver.find_element(By.XPATH, constant.XPATH03).find_elements(By.TAG_NAME, 'li')
                for i in range(len(li_tag)):
                    # Only proceed when a result's name matches exactly, so the
                    # wrong account is never scraped.
                    li_text = li_tag[i].find_element(By.TAG_NAME, 'strong').get_attribute('textContent')
                    if li_text == wechat_official_account_name:
                        driver.find_element(By.CSS_SELECTOR, constant.CSS03).click()
                        time.sleep(random.randint(5, 10))
                        more_page(school_name)
                        break

常量包

# *coding:utf-8 *
CAPS = {
    'browserName': 'chrome',
    'version': '',
    'platform': 'ANY',
    'goog:loggingPrefs': {'performance': 'ALL'},   # capture performance (network) logs
    'goog:chromeOptions': {'extensions': [], 'args': ['--headless']}  # headless mode
}

# MP admin home page. NOTE(review): the token is session-bound and expires —
# confirm it is refreshed before each run.
URL = 'https://mp.weixin.qq.com/cgi-bin/home?t=home/index&lang=zh_CN&token=1280197235'
# "Image & text message" tile on the home page
CSS01 = '#app > div.main_bd_new > div:nth-child(4) > div.weui-desktop-panel__bd > div > div:nth-child(1)'
# "Select other official account" button
CSS02 = '#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > p > div > button'
# Official-account name input box
XPATH01 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input'
# "Search" — NOTE(review): identical to XPATH01 (it is the input, not a
# button); search is actually triggered by sending ENTER to the input.
XPATH02 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input'
# Search-results <ul>
XPATH03 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div[2]/ul'
# First matching account in the search results
CSS03 = '#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > div > div.weui-desktop-search__panel > ul > li:nth-child(1) > div.weui-desktop-vm_primary'
# Total page count label
XPATH04 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]'
# "Next page" anchor: page 1 has a single anchor; later pages use a[2]
XPATH05 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
XPATH06 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'

写着文章,一不留神号被封了。。。一句话,不要拿自己的爱好挑战别人的饭碗,哈哈~

在这里插入图片描述


也该翻篇了,提升自己才是王道,がんばって康巴得。不过还是希望她的身体难受能快些好,那么瘦一看就是弱身子,还是得多吃肉啊。。。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值