执行的代码如下:
# This demo scrapes a JavaScript-rendered page using Selenium with Firefox in headless mode. Note: Firefox and geckodriver (the WebDriver binary for Firefox) must both be installed locally.
from bs4 import BeautifulSoup
import time
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as expected #这个和下面那个webDriverWait是配合使用的,用来完成等待js加载完成
from selenium.webdriver.support.wait import WebDriverWait
# Collect the job-posting URLs found on each result page.
def getListUrl(driver,url,pageNumber):
    """Collect job-posting links from up to ``pageNumber`` result pages.

    Loads ``url`` in ``driver``, then for each page parses the rendered HTML,
    gathers every ``href`` containing ``"jobs"`` from the anchors inside the
    first ``.contentpile`` element, and clicks the third ``<button>`` on the
    page to move on (presumably the "next page" control -- confirm against
    the site's markup).

    Args:
        driver: Selenium WebDriver used to load and page through the results.
        url: Address of the first result page.
        pageNumber: Maximum number of result pages to scan.

    Returns:
        A set of the matching ``href`` strings. The crawl is best-effort: if
        a page cannot be parsed or the button cannot be clicked, the links
        gathered so far are returned.
    """
    urlList = set()
    driver.get(url)
    # Fixed pause so the page's JavaScript has time to render the listing.
    time.sleep(5)
    html = driver.page_source
    page = 0
    try:
        for page in range(0, pageNumber):
            print(page)
            bs = BeautifulSoup(html, features="lxml")
            for a in bs.select(".contentpile")[0].select("a"):
                href = a.get("href")
                # Anchors without an href yield None; skip them instead of
                # letting the resulting AttributeError end the whole crawl.
                if href and "jobs" in href:
                    urlList.add(href)
            driver.find_elements_by_tag_name("button")[2].click()
            time.sleep(5)
            html = driver.page_source
    except Exception as exc:
        # Keep the best-effort contract (return what was collected), but say
        # why the crawl stopped instead of swallowing the error silently.
        print("getListUrl stopped on page %s: %r" % (page, exc))
    return urlList
# driver.
option = Options()
option.add_argument("--headless")  # headless mode: run Firefox without opening a window
baseUrl = "https://sou.zhaopin.com/?jl=489&kw=大数据开发&kt=33"
url1 = "https://www.zhaopin.com"
# executable_path must name the geckodriver binary itself, and that file must
# be executable by the current user (e.g. `chmod +x`); otherwise Selenium
# fails with "'geckodriver' executable may have wrong permissions".
driver = Firefox(executable_path="/usr/lib/geckodriver",firefox_options=option)
try:
    urls = getListUrl(driver,baseUrl,65)
    print(len(urls))
    for url in urls:
        print(url)
finally:
    # quit() ends the session and stops the geckodriver process even when the
    # crawl raises; close() would only close the current window.
    driver.quit()
================================================
执行结果报错!报错!报错!
/home/hairui/PycharmProjects/pythonPractice/venv/bin/python /home/hairui/PycharmProjects/pythonPractice/venv/Spider_Demo.py
Traceback (most recent call last):
File "/home/hairui/PycharmProjects/pythonPractice/venv/lib/python3.5/site-packages/selenium/webdriver/common/service.py", line 76, in start
stdin=PIPE)
File "/usr/lib/python3.5/subprocess.py", line 947, in __init__
restore_signals, start_new_session)
File "/usr/lib/python3.5/subprocess.py", line 1551, in _execute_child
raise child_exception_type(errno_num, err_msg)
PermissionError: [Errno 13] Permission denied
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/hairui/PycharmProjects/pythonPractice/venv/Spider_Demo.py", line 46, in <module>
driver = Firefox(executable_path="/usr/lib/geckodriver",firefox_options=option)
File "/home/hairui/PycharmProjects/pythonPractice/venv/lib/python3.5/site-packages/selenium/webdriver/firefox/webdriver.py", line 164, in __init__
self.service.start()
File "/home/hairui/PycharmProjects/pythonPractice/venv/lib/python3.5/site-packages/selenium/webdriver/common/service.py", line 88, in start
os.path.basename(self.path), self.start_error_message)
selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable may have wrong permissions.
Process finished with exit code 1