下面附上我的代码,求教一下。我该考虑的东西都考虑了,是加载的时候网站把我当成爬虫了吗?
from selenium import webdriver
import time
from pyquery import PyQuery
import re
import os
import csv
def scrollTo_text(response):
    """Scroll the article tab to the bottom until no more entries appear.

    Counts the ``.cnt-list`` elements in the page source and keeps scrolling
    the window to the bottom while the count has not outgrown the expected
    threshold.  When a scroll does not raise the count above the previous
    threshold, the page source is re-read once after a longer wait; if the
    count is still too low, loading is treated as finished.

    Relies on the module-level ``browser`` WebDriver instance created by the
    script entry point.

    Args:
        response: HTML source of the page as currently loaded.
    """
    # Presumably the number of entries the page adds per lazy-load batch.
    flag = 20
    i = 1
    mes = PyQuery(response)('.cnt-list')
    while True:
        if i * flag < len(mes):
            # More entries are present than this round expects: stop here.
            print(i * flag, len(mes))
            break

        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)
        mes = PyQuery(browser.page_source)('.cnt-list')

        # The scroll counts as successful once the entry count exceeds the
        # threshold of the previous round.
        previous_threshold = (i - 1) * flag
        if previous_threshold < len(mes):
            print('加载成功 ' + str(len(mes)))
        else:
            print('加载失败,重试中' + str(len(mes)))
            # Give the page more time, then re-read the source once.
            time.sleep(5)
            mes = PyQuery(browser.page_source)('.cnt-list')
            if previous_threshold < len(mes):
                print('重试成功' + str(len(mes)))
            else:
                print('全部加载完成' + str(len(mes)))
                break
        i += 1
def scrollTo_video(response):
    """Scroll the video tab to the bottom until no more entries appear.

    Counts the ``.largevideo`` elements in the page source and keeps scrolling
    the window to the bottom while the count has not outgrown the expected
    threshold.  When a scroll does not raise the count above the current
    threshold, the page source is re-read once after a longer wait; if the
    count is still too low, loading is treated as finished.

    Relies on the module-level ``browser`` WebDriver instance created by the
    script entry point.

    Args:
        response: HTML source of the page as currently loaded.
    """
    # Presumably the number of entries the page adds per lazy-load batch.
    flag = 10
    i = 1
    mes = PyQuery(response)('.largevideo')
    while True:
        if i * flag < len(mes):
            # More entries are present than this round expects: stop here.
            print(i * flag, len(mes))
            break

        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(1)
        mes = PyQuery(browser.page_source)('.largevideo')

        if i * flag < len(mes):
            print('加载成功 ' + str(len(mes)))
        else:
            print('加载失败,重试中 ' + str(len(mes)))
            # Give the page more time, then re-read the source once.
            time.sleep(5)
            # Count with the same selector as every other check so the retry
            # comparison is made against a like-for-like number.
            mes = PyQuery(browser.page_source)('.largevideo')
            if i * flag < len(mes):
                print('重试成功 ' + str(len(mes)))
            else:
                print('全部加载完成 ' + str(len(mes)))
                break
        i += 1
def _first_match(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    found = re.search(pattern, text)
    return found.group(1) if found else ''


def get_complete_text_mes(complete_text_response, author_name):
    """Extract article entries from the page source and append them to a CSV.

    Every ``.bBor`` element yields one row with the article URL, title,
    reading count (``.pv``), upload time (``.time``) and the fixed type
    '文章'.  A header row is written first, then all article rows, through a
    single file handle so the header is guaranteed to precede the rows.

    Relies on the module-level ``now_time`` string set by the script entry
    point to build the output path.

    Args:
        complete_text_response: HTML source of the fully scrolled article tab.
        author_name: Author name used as the CSV file name.
    """
    res = PyQuery(complete_text_response)
    # Output location: 用户信息/<date>/<author>.csv
    local = '用户信息/' + now_time + '/' + author_name + '.csv'
    fieldnames = ['type', 'title', 'reading', 'url', 'upload_time']
    with open(local, 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.DictWriter(f, fieldnames=fieldnames)
        f_csv.writeheader()
        for message in res('.bBor').items():
            mes = str(message)
            f_csv.writerow({
                # An entry without a link or <h2> gets an empty field
                # instead of aborting the whole export.
                'url': _first_match('.*?<a href="(.*?)"', mes),
                'title': _first_match('.*?<h2>(.*?)</h2>', mes),
                'reading': message('.pv').text(),
                'upload_time': message('.time').text(),
                'type': '文章',
            })
    print('获取文章完成')
def get_complete_video_mes(complete_video_response):
    """Extract video entries from the page source and append them to a CSV.

    Every ``.largevideo-wrapper`` element yields one row with the video title
    (``.largevideo-title``), reading count (``.pv``), the ``data-src``
    attribute of ``.largevideo-box`` as URL, and the fixed type '视频'.  No
    header row is written here.

    Relies on the module-level ``now_time`` and ``author_name`` values set by
    the script entry point to build the output path.

    Args:
        complete_video_response: HTML source of the fully scrolled video tab.
    """
    video_complete_response = PyQuery(complete_video_response)
    # Output location: 用户信息/<date>/<author>.csv
    local = '用户信息/' + now_time + '/' + author_name + '.csv'
    fieldnames = ['type', 'title', 'reading', 'url']
    rows = [
        {
            'title': video_single_mes('.largevideo-title').text(),
            'reading': video_single_mes('.pv').text(),
            'url': video_single_mes(".largevideo-box").attr('data-src'),
            'type': '视频',
        }
        for video_single_mes in video_complete_response('.largevideo-wrapper').items()
    ]
    # Only touch the file when there is something to write, and write all
    # rows through one handle instead of reopening the file per row.
    if rows:
        with open(local, 'a', newline='', encoding='utf-8') as f:
            f_csv = csv.DictWriter(f, fieldnames=fieldnames)
            f_csv.writerows(rows)
    print('获取视频完成')
if __name__ == '__main__':
    # Script entry point.  ``browser``, ``now_time`` and ``author_name`` are
    # deliberately module-level names: the scroll and export functions above
    # read them as globals.
    # NOTE(review): ``executable_path`` and ``find_element_by_xpath`` are the
    # older Selenium API; confirm they exist in the installed Selenium version.
    browser = webdriver.Chrome(executable_path='./chromedriver.exe')
    try:
        # Open the login page.
        browser.get('https://baijiahao.baidu.com/builder/author/register/index')
        # Time (seconds) left for the user to type in user name and password.
        time.sleep(60)
        # Load the author page URLs, one per line; blank lines are skipped so
        # they are never handed to ``browser.get``.
        with open('./百家号导入链接.txt', 'r') as f:
            url_list = [line.strip() for line in f.read().splitlines() if line.strip()]
        now_time = time.strftime('%Y-%m-%d')
        # Create the per-day output directory (and its parent) if missing.
        bace_local = "用户信息/" + now_time
        os.makedirs(bace_local, exist_ok=True)
        for url in url_list:
            browser.get(url)
            time.sleep(1)
            # Source of the page as initially loaded.
            text_response = browser.page_source
            get_name = PyQuery(text_response)
            # Author name shown on the page.
            author_name = get_name('.name:first-child').text()
            print('\n作者: ' + author_name)
            local = '用户信息/' + now_time + '/' + author_name + '.csv'
            # An existing CSV means this author was already exported today.
            if os.path.exists(local):
                print('今日已下载过 ' + author_name + ' 的所有信息')
                continue

            print('\n获取完整文章信息中...')
            # Scroll until all articles are loaded, then export them.
            scrollTo_text(text_response)
            text_response = browser.page_source
            get_complete_text_mes(text_response, author_name)

            # The video tab sits at the top of the page, so scroll back up
            # before clicking it.
            time.sleep(1)
            browser.execute_script('window.scrollTo(0, 0)')
            video = browser.find_element_by_xpath('//*[@id="tab"]/div[1]/div/div[1]/div[3]')
            video.click()
            time.sleep(1)

            print('\n获取完整视频信息中...')
            video_bace_response = browser.page_source
            # Scroll until all videos are loaded, then export them.
            scrollTo_video(video_bace_response)
            vider_complete_response = browser.page_source
            get_complete_video_mes(vider_complete_response)
    finally:
        # Always release the browser and the driver process, even when a
        # page fails to load or parsing raises.
        browser.quit()
非常感谢各位大佬的回复。方便的话,请留下你的 QQ 或者微信,我们沟通一下。