过程3
之前写的代码基本完成了90%的功能,但剩下的10%也比较难搞,主要是对接口请求次数太多,账号会被禁用。
demo:
# *coding:utf-8 *.
import random
import time
import json
import pymysql
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from datetime import datetime, timedelta
# Launch Chrome through a locally-installed chromedriver binary.
driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
# ``get`` blocks until the page is fully loaded before the script continues.
time.sleep(2)
driver.get('https://mp.weixin.qq.com/cgi-bin/home?t=home/index&lang=zh_CN&token=1280197235')
# Click the 'jumpUrl' link, then wait generously for the admin home to render.
href1 = driver.find_element_by_id('jumpUrl')
ActionChains(driver).click(href1).perform()
time.sleep(20)
# Open the "image & text message" card on the home page.
href2 = driver.find_element_by_css_selector('#app > div.main_bd_new > div:nth-child(4) > div.weui-desktop-panel__bd > div > div:nth-child(1)')
ActionChains(driver).click(href2).perform()
# The editor opens in a new window/tab; switch focus to it.
driver.switch_to.window(driver.window_handles[-1])
time.sleep(2)
# Open the "insert link" dialog inside the article editor.
btn1 = driver.find_element_by_id('js_editor_insertlink')
ActionChains(driver).click(btn1).perform()
time.sleep(1)
# btn2 = driver.find_element_by_css_selector('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/p/div/button')
# ActionChains(driver).click(btn2).perform()
# time.sleep(1)
# (pause point)
# Database-insert helper follows.
def insert_schools_article(title, herf, school_name, date):
    """Insert one scraped article row into schools_wenku.schools_article.

    Args:
        title: article title as scraped from the page.
        herf: article URL (parameter name kept for interface compatibility).
        school_name: school the official account belongs to.
        date: publication date string as scraped from the page.
    """
    db = pymysql.connect(host="localhost", user="root", password="", database="schools_wenku")
    try:
        cursor = db.cursor()
        # Parameterized query: the previous str.format() version was open to
        # SQL injection and broke on quotes, which is why the title had its
        # single quotes stripped. The driver now handles all escaping.
        sql = ("insert into schools_article"
               "(account_title, official_account_url, school_name, date_issued, collection) "
               "values(%s, %s, %s, %s, 0)")
        print(sql)
        cursor.execute(sql, (title, herf, school_name, date))
        db.commit()
        cursor.close()
    finally:
        # Always release the connection, even if the insert fails.
        db.close()
# cursor.execute("select * from schools_article")
# data = cursor.fetchall()
#封装爬取第一页
def first_page(title_list, school_name):
    """Scrape title/date/href from the first results page and store each row.

    Args:
        title_list: selenium WebElements, one per article row
            (class ``inner_link_article_item``).
        school_name: school name stored alongside each article.
    """
    # The enumerate index was unused in the original loop; iterate directly.
    for temp in title_list:
        spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
        # The title text lives in the second <span> of the title element.
        title = spans[1].get_attribute('textContent')
        date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
        href = temp.find_element_by_tag_name('a').get_attribute('href')
        print("title:", title)
        print('date:', date)
        print("href:", href)
        insert_schools_article(title, href, school_name=school_name, date=date)
#封装爬取多页
def more_page(school_name,):
    """Click through the remaining result pages and store every article row.

    Uses the module-level ``driver``; assumes the account's first result page
    is already open (page 1 itself is handled by first_page()).
    """
    try:
        # Total page count, read from the pagination widget's second label.
        pages = int(driver.find_element_by_xpath(
            '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]').get_attribute(
            'textContent'))
        print(pages)
    except Exception:
        # NOTE(review): if this lookup fails, ``pages`` is never bound and the
        # loop below raises NameError, which the inner except then swallows.
        pass
    finally:
        # print(pages_before)
        # if (pages_before % 20) == 0:
        #     time.sleep(600)
        try:
            # Every ``tmp2`` pages (random 13-18), sleep 20-40 minutes to
            # avoid tripping the request-rate ban.
            tmp = 0
            tmp2 = random.randint(13, 18)
            for x in range(pages-1):
                tmp+=1
                if tmp == tmp2:
                    time.sleep(random.randint(1200, 2400))
                    tmp = 0
                # Page 1 exposes only a "next" anchor; later pages have both
                # prev and next, so "next" becomes a[2].
                if x > 0:
                    xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'
                    # Current page number (first label of the pagination widget).
                    pages_before = int(driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[1]').get_attribute(
                        'textContent'))
                else:
                    xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
                    pages_before = int(driver.find_element_by_xpath(
                        '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[1]').get_attribute(
                        'textContent'))
                # Extra long pause every 15 pages, again as ban avoidance.
                if (pages_before % 15) == 0:
                    time.sleep(random.randint(1200,2400))
                print(pages_before)
                # Advance to the next page, then scrape its article rows.
                driver.find_element_by_xpath(xpth_txt).click()
                time.sleep(random.randint(5,10))
                title_list = driver.find_elements_by_class_name('inner_link_article_item')
                for i, temp in enumerate(title_list):
                    spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
                    # The title text lives in the second <span>.
                    title = spans[1].get_attribute('textContent')
                    date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
                    href = temp.find_element_by_tag_name('a').get_attribute('href')
                    print("title:", title)
                    print('date:', date)
                    print("href:", href)
                    insert_schools_article(title, href, school_name=school_name,date=date)
        except Exception as e:
            # Broad catch keeps the scraper alive on any page-level failure.
            print(e)
# PRINT = i + 1
# print(page_num)
# if int(page_num) % 25 == 0:
# time.sleep(600)
# Load the school list, then scrape each school's official account in turn.
with open('beishangguangshen_schoollist_01.json','r',encoding='utf8')as fp:
    json_data = json.loads(fp.read())
print(json_data)
for fp_list in json_data:
    school_name = fp_list['school_name']
    wechat_official_account_name = fp_list['wechat_official_account_name']
    print(school_name)
    print(wechat_official_account_name)
    # Open the "choose another official account" panel in the link dialog.
    butten_other = driver.find_element_by_css_selector('#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > p > div > button')
    ActionChains(driver).click(butten_other).perform()
    # Type the account name and trigger the search with ENTER.
    driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input').send_keys(wechat_official_account_name)
    time.sleep(random.randint(5,10))
    # driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/span/button').click()
    driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input').send_keys(Keys.ENTER)
    time.sleep(random.randint(5,10))
    # Click the first search result to open that account's article list.
    driver.find_element_by_css_selector('#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > div > div.weui-desktop-search__panel > ul > li:nth-child(1) > div.weui-desktop-vm_primary').click()
    time.sleep(random.randint(5,10))
    title_list = driver.find_elements_by_class_name('inner_link_article_item')
    # pages = int(driver.find_element_by_xpath('//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]').get_attribute('textContent'))
    time.sleep(random.randint(15,30))
    # Scrape page 1, then iterate the remaining pages.
    first_page(title_list,school_name)
    more_page(school_name)
# def first_page():
# for i, temp in enumerate(title_list):
# spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
# title = spans[1].get_attribute('textContent')
# date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
# href = temp.find_element_by_tag_name('a').get_attribute('href')
# print("title:", title)
# print('date:', date)
# print("href:", href)
#
# def more_page():
# for x in range(pages-1):
# if x > 0:
# xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'
# else:
# xpth_txt = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
# driver.find_element_by_xpath(xpth_txt).click()
#
# time.sleep(1,10)
# title_list = driver.find_elements_by_class_name('inner_link_article_item')
# for i, temp in enumerate(title_list):
# spans = temp.find_element_by_class_name('inner_link_article_title').find_elements_by_tag_name('span')
# title = spans[1].get_attribute('textContent')
# date = temp.find_element_by_class_name('inner_link_article_date').get_attribute('textContent')
# href = temp.find_element_by_tag_name('a').get_attribute('href')
# print("title:", title)
# print('date:', date)
# print("href:", href)
过程4
这一版基本实现了翻页和对不同学校公众号情况的处理,也对封禁情况进行了处理。不过写得实在是不堪入目:很多循环代码都是重复劳作,加上 xpath 和 css_selector 实在是太太太太长了。于是对上面的一些循环和执行步骤进行了封装,再把比较长的 xpath 和 css_selector 打包作为常量存起来再引用,外加日志功能,报错的话看一下情况。您再看,哇!优雅!
# *coding:utf-8 *.
import random
import time
import json
import pymysql
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import os
import constant
from datetime import datetime
# 封装数据库写入
def insert_schools_article(title, herf, school_name, date):
    """Insert one scraped article row into shaodong.t_shaodong2.

    Args:
        title: article title as scraped from the page.
        herf: article URL (parameter name kept for interface compatibility).
        school_name: school the official account belongs to.
        date: publication date string as scraped from the page.
    """
    db = pymysql.connect(host="localhost", user="root", password="123456", database="shaodong")
    try:
        cursor = db.cursor()
        # Parameterized query: the previous str.format() version was open to
        # SQL injection and required stripping single quotes from the title.
        sql = ("insert into t_shaodong2"
               "(account_title, official_account_url, school_name, date_issued, collection) "
               "values(%s, %s, %s, %s, 0)")
        cursor.execute(sql, (title, herf, school_name, date))
        db.commit()
        cursor.close()
    finally:
        # Always release the connection, even if the insert fails.
        db.close()
# 获取本页数据 内容
def obtain_content(title_list, school_name):
    """Extract title/date/href from each article row element and persist it.

    Args:
        title_list: selenium WebElements, one per article row
            (class ``inner_link_article_item``).
        school_name: school name stored alongside each article.
    """
    # Renamed the loop variable: the original iterated as ``title`` and then
    # rebound ``title`` to the text mid-loop, shadowing the element.
    for item in title_list:
        spans = item.find_element(By.CLASS_NAME, 'inner_link_article_title').find_elements(By.TAG_NAME, 'span')
        # The title text lives in the second <span> of the title element.
        title = spans[1].get_attribute('textContent')
        date = item.find_element(By.CLASS_NAME, 'inner_link_article_date').get_attribute('textContent')
        href = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        print("title:", title)
        print('date:', date)
        print("href:", href)
        insert_schools_article(title, href, school_name=school_name, date=date)
def more_page(school_name):
    """Page through every result page for the current account and store rows.

    Relies on the module-level ``driver`` and the selectors in ``constant``.
    On suspected bans it backs off for a day and retries the search.
    """
    try:
        # Total page count shown by the pagination widget.
        pages = int(driver.find_element(By.XPATH, constant.XPATH04).get_attribute('textContent'))
    except Exception as e:
        # Pagination missing usually means rate limiting: back off a day,
        # re-run the search, then read the count again.
        print(e)
        time.sleep(86400)
        driver.find_element(By.XPATH, constant.XPATH02).send_keys(Keys.ENTER)
        time.sleep(random.randint(5, 10))
        pages = int(driver.find_element(By.XPATH, constant.XPATH04).get_attribute('textContent'))
    for page_num in range(pages):
        time.sleep(random.randint(3, 5))
        # Article rows on the current page.
        title_list = driver.find_elements(By.CLASS_NAME, 'inner_link_article_item')
        if len(title_list) == 0:
            time.sleep(10)
            print("尝试停10秒看能不能行", '当前时间:', datetime.now())
            title_list = driver.find_elements(By.CLASS_NAME, 'inner_link_article_item')
            if title_list:
                # FIX: the original discarded this refetched list and slept a
                # day even when the short pause recovered the page.
                obtain_content(title_list, school_name)
            else:
                print("应该是被封了,等待一天", '当前时间:', datetime.now())
                # Dump network entries from the performance log for diagnosis.
                # FIX: read the log fresh here -- the module-level ``logs`` was
                # captured once at startup and never reflected later traffic.
                for item in driver.get_log("performance"):
                    log = json.loads(item["message"])["message"]
                    if "Network.response" in log["method"]:
                        print(log)
                time.sleep(86400)
        else:
            obtain_content(title_list, school_name)
        # Current page number, from the pagination widget's first number tag.
        page_num_tag = driver.find_elements(By.CLASS_NAME, 'weui-desktop-pagination__num')[0].get_attribute('textContent')
        if int(page_num_tag) == 1:
            # Page 1 has only a "next" anchor, no "previous".
            xpth_txt = constant.XPATH05
        else:
            xpth_txt = constant.XPATH06
        if page_num < pages - 1:
            driver.find_element(By.XPATH, xpth_txt).click()
def iter_file(path, file_format):
    """Recursively collect files under *path* whose names end with *file_format*.

    Args:
        path: root directory to walk.
        file_format: filename suffix to match, e.g. ``'json'``.

    Returns:
        List of full paths of every matching file in the whole tree.
    """
    files = []
    for dirpath, _dirnames, filenames in os.walk(path):
        # FIX: accumulate with extend -- the original rebound ``files`` on
        # every directory, so only the last walked directory's matches were
        # ever returned.
        files.extend(os.path.join(dirpath, name)
                     for name in filenames if name.endswith(file_format))
    return files
if __name__ == '__main__':
    # Launch Chrome via a local chromedriver, headless and with the
    # performance log enabled (see constant.CAPS).
    driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=driver_path, desired_capabilities=constant.CAPS)
    # NOTE(review): this snapshots the performance log once, before any page
    # is loaded -- later reads of ``logs`` in more_page() see a stale copy;
    # consider re-fetching with driver.get_log("performance") at use time.
    logs = driver.get_log("performance")
    time.sleep(2)
    driver.get(constant.URL)
    # Click the 'jumpUrl' link and wait for the admin home to render.
    href1 = driver.find_element(By.ID, 'jumpUrl')
    ActionChains(driver).click(href1).perform()
    time.sleep(10)
    # Open the "image & text message" card on the home page.
    href2 = driver.find_element(By.CSS_SELECTOR, constant.CSS01)
    ActionChains(driver).click(href2).perform()
    # The editor opens in a new window/tab; switch focus to it.
    driver.switch_to.window(driver.window_handles[-1])
    time.sleep(2)
    # Open the "insert link" dialog inside the article editor.
    btn1 = driver.find_element(By.ID, 'js_editor_insertlink')
    ActionChains(driver).click(btn1).perform()
    time.sleep(1)
    # Walk the folder of school-list JSON files and process each school.
    json_list = iter_file(r'C:\Users\CH\Desktop\school', 'json')
    for json_file in json_list:
        with open(json_file, 'r', encoding='utf8')as fp:
            json_data = json.loads(fp.read())
        # print(json_data)
        for fp_list in json_data:
            school_name = fp_list['school_name']
            wechat_official_account_name = fp_list['wechat_official_account_name']
            print(school_name)
            print(wechat_official_account_name)
            # Open the "choose another official account" panel, type the
            # account name, and trigger the search with ENTER.
            button_other = driver.find_element(By.CSS_SELECTOR, constant.CSS02)
            ActionChains(driver).click(button_other).perform()
            driver.find_element(By.XPATH, constant.XPATH01).send_keys(wechat_official_account_name)
            time.sleep(random.randint(5, 10))
            driver.find_element(By.XPATH, constant.XPATH02).send_keys(Keys.ENTER)
            time.sleep(random.randint(5, 10))
            # Search-result <li> entries under the results <ul>.
            li_tag = driver.find_element(By.XPATH, constant.XPATH03).find_elements(By.TAG_NAME, 'li')
            for i in range(len(li_tag)):
                li_text = li_tag[i].find_element(By.TAG_NAME, 'strong').get_attribute('textContent')
                # Only proceed on an exact account-name match.
                # NOTE(review): CSS03 always clicks the FIRST result, even if
                # the match was at index i > 0 -- verify this is intended.
                if li_text == wechat_official_account_name:
                    driver.find_element(By.CSS_SELECTOR, constant.CSS03).click()
                    time.sleep(random.randint(5, 10))
                    more_page(school_name)
                    break
常量包
# *coding:utf-8 *
# Selenium desired capabilities: headless Chrome with the performance log
# enabled so network traffic can be inspected when scraping stalls.
CAPS = {
    'browserName': 'chrome',
    'version': '',
    'platform': 'ANY',
    'goog:loggingPrefs': {'performance': 'ALL'},  # record performance logs
    'goog:chromeOptions': {'extensions': [], 'args': ['--headless']}  # headless mode
}
# WeChat Official Account admin home (token is session-specific).
URL = 'https://mp.weixin.qq.com/cgi-bin/home?t=home/index&lang=zh_CN&token=1280197235'
# "image & text message" card on the admin home page
CSS01 = '#app > div.main_bd_new > div:nth-child(4) > div.weui-desktop-panel__bd > div > div:nth-child(1)'
# "choose another official account" button in the insert-link dialog
CSS02 = '#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > p > div > button'
# account-name search input box
XPATH01 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input'
# search trigger -- NOTE(review): identical to XPATH01 (the input); the code
# sends ENTER to the input rather than clicking a real search button.
XPATH02 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div/div[1]/span/input'
# <ul> holding the account search results
XPATH03 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[4]/div/div/div/div[2]/ul'
# first matching account entry in the search dropdown
CSS03 = '#vue_app > div.weui-desktop-link-dialog > div.weui-desktop-dialog__wrp > div > div.weui-desktop-dialog__bd > div.link_dialog_panel > form:nth-child(1) > div:nth-child(4) > div > div > div > div.weui-desktop-search__panel > ul > li:nth-child(1) > div.weui-desktop-vm_primary'
# pagination label holding the total page count
XPATH04 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/span/label[2]'
# "next page" anchor on page 1 (only anchor present) ...
XPATH05 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a'
# ... and on later pages, where prev+next both exist, "next" is a[2].
XPATH06 = '//*[@id="vue_app"]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[5]/div/div/div[3]/span[1]/a[2]'
写着文章,一不留神号被封了。。。一句话,不要拿自己的爱好挑战别人的饭碗,哈哈~
也该翻篇了,提升自己才是王道,がんばって康巴得。不过还是希望她的身体难受能快些好,那么瘦一看就是弱身子,还是得多吃肉啊。。。