selenium+bs4爬取百度文库
技术难点:
1.百度文库设有反爬机制,但使用selenium模拟浏览器可以爬取;
2.百度文库内容是动态加载的,需要模拟鼠标滚轮滚动、在页面中输入搜索来进行爬取。
# -*- coding: utf-8 -*-
"""
Created on 2020/4/9 16:48
@author: wk
"""
from selenium import webdriver
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH# 用来居中显示标题
from time import sleep
from selenium.webdriver.common.keys import Keys
# Path to the chromedriver executable (commented out; kept for reference only).
# BROWSER_PATH = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe'

# Target URL: the Baidu Wenku document page that find_doc loads.
DEST_URL: str = 'https://wenku.baidu.com/view/06a93103f78a6529647d535f.html?sxts=1587007143237'

# Module-level state used to hold the scraped document; find_doc declares
# both names as globals.
doc_title: str = ''
doc_content_list: list = []
def find_doc(driver, init=True):
global doc_content_list
global doc_title
global page_count
stop_condition = False
if (init is True): # 得到标题
driver.get(DEST_URL)
driver.encoding