对于动态加载的网页,打开网页查看网页源码(注意不是检查元素)会发现要爬取的信息并不在源码里面,也就是说,无法直接通过解析网页源码得到数据。Selenium+PhantomJS的强大一方面就在于能将渲染后的完整源码抓取到。
# -*- coding: utf-8 -*-
import sys

import xlsxwriter
from bs4 import BeautifulSoup
from selenium import webdriver

# Python 2 only: switch the interpreter-wide default encoding to UTF-8 so the
# str() conversions of non-ASCII page text below do not raise.
reload(sys)
sys.setdefaultencoding("utf-8")

# Location of the PhantomJS binary that renders the page.
PHANTOMJS_PATH = r"C:\Users\Administrator\Desktop\phantomjs-1.9.7-windows\phantomjs.exe"

# PhantomJS command-line switches for anonymous crawling through a SOCKS5
# proxy on localhost:9999 (assumes a tor service is listening on that port).
# Pass this as ``service_args`` to get_grade() to enable it.
TOR_SERVICE_ARGS = ['--proxy=localhost:9999', '--proxy-type=socks5']


def get_grade(url, service_args=None):
    """Render *url* with PhantomJS and copy its table rows into the worksheet.

    Every ``<tr>`` whose markup contains a literal ``<td>`` is treated as a
    data row.  The row's text is cut into five fixed-width fields, counted
    from the end of the string, and written to columns 0-4 of the
    module-level ``worksheet`` at the row given by the module-level
    counter ``i``, which is advanced once per data row.

    Args:
        url: Address of the page to load.
        service_args: Optional list of PhantomJS command-line switches
            (for example ``TOR_SERVICE_ARGS``).  ``None`` starts PhantomJS
            without extra switches, which is what this function always did
            before the parameter existed.
    """
    # Row cursor shared with the rest of the script so that consecutive
    # calls keep appending below the rows written by earlier calls.
    global i

    print(url)

    driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH,
                                 service_args=service_args)
    try:
        driver.get(url)
        # page_source is the DOM after scripts have run, so dynamically
        # loaded rows are present here even though they are missing from
        # the raw HTTP response.
        data = driver.page_source
    finally:
        # Always shut the browser down; otherwise each call leaves a
        # phantomjs process running.
        driver.quit()

    soup = BeautifulSoup(data, 'lxml')
    for row in soup.find_all('tr'):
        # Only rows that contain a plain <td> cell are written.
        if '<td>' not in str(row):
            continue

        i += 1
        print(i)

        row_text = row.get_text()
        print(row_text)
        row_text = str(row_text)

        # Fixed-width layout measured from the end of the row text; the
        # names in parentheses are the ones the original script used.
        worksheet.write(i, 0, row_text[:-13])     # (city) everything before the last 13 chars
        worksheet.write(i, 1, row_text[-13:-9])   # (time) 4 chars
        worksheet.write(i, 2, row_text[-9:-7])    # (subs) 2 chars
        worksheet.write(i, 3, row_text[-7:-3])    # (s) 4 chars
        worksheet.write(i, 4, row_text[-3:])      # (grade) last 3 chars


# Worksheet row cursor; starts at -1 so the first data row lands on row 0.
i = -1
# NOTE(review): the statement below is cut off at the edge of the reviewed
# excerpt and is kept verbatim.
workbook