Goal: dynamically crawl a CSDN user's download resource information and comments.
Steps: 1. Get the total number of list pages, via the getPage() function;
2. Get the URL of each resource on every list page;
3. Using the URLs from step 2, crawl each resource's download information and the comments on its first page (comment pagination is not implemented).
'''
Created on December 15, 2017
@filename: getCSDN.py
@author: geng
'''
from selenium import webdriver
import selenium.webdriver.support.ui as ui
import re
import time
# Start two headless PhantomJS browsers, set an explicit wait, and open the upload list page
driver = webdriver.PhantomJS(executable_path = "phantomjs")
driver_detial = webdriver.PhantomJS(executable_path="phantomjs")
wait = ui.WebDriverWait(driver, 3)
driver.get("http://download.csdn.net/user/eastmount/uploads")
SUMRESOURCES = 0  # global counter of crawled resources (globals are best avoided)
# Get the number of list pages
def getPage():
    texts = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div[1]/div/div/div").text
    print(texts)
    m = re.findall(r'[0-9]+', texts)
    print('Page count:', m[1])
    return int(m[1])
# Get the URL and title of every resource on one list page
def getUrl_Title(num):
    global SUMRESOURCES
    url = "http://download.csdn.net/user/eastmount/uploads/" + str(num)
    print("List page URL: " + url)
    driver.get(url)
    wait.until(lambda driver: driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div[1]/div/div/div"))
    list_container = driver.find_elements_by_xpath("//div[@class='content']/h3/a")
    for content in list_container:
        print("NO." + str(SUMRESOURCES + 1))
        print("Title: " + content.text)
        print("Link: " + content.get_attribute('href'))
        SUMRESOURCES += 1
        getDetial(content.get_attribute('href'))
    print()
# Get the detail page for one resource. The first driver is still on the list page,
# so a second instance (driver_detial) is used here; reusing the first one raises
# "Message: Error Message => 'Element does not exist in cache'"
def getDetial(url):
    driver_detial.get(url)
    detial = driver_detial.find_element_by_xpath("//div[@class='dl_operate clearfix']").text
    downloadInfo = driver_detial.find_element_by_xpath("//div[@class='dl_download']").text.split("\n")[0]
    print(detial, downloadInfo)
    comments = driver_detial.find_elements_by_xpath("//*[@id='p_']/dl/dd[1]")
    for comment in comments:
        print("Comment:", comment.text)
    print()
# getUrl_Title(1)
# getDetial("http://download.csdn.net/download/eastmount/9788218")
def main():
    start = time.time()
    pageNum = getPage()
    i = 1
    while i <= pageNum:
        getUrl_Title(i)
        i += 1
    print("SUMRESOURCES:", SUMRESOURCES)
    print("Load Over")
    end = time.time()
    print("Time: {}s".format(end - start))

if __name__ == "__main__":
    main()
Sample run output:
Summary:
PhantomJS is much slower than BeautifulSoup, but BeautifulSoup can only fetch the static HTML source returned by the server and misses anything rendered by JavaScript; that is exactly where PhantomJS's dynamic page loading is an advantage.
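For contrast, here is a minimal static-fetch sketch using requests and BeautifulSoup (not part of the original script; the CSS selector mirrors the XPath used in getUrl_Title() and is an assumption about the page markup). It only sees the HTML the server returns, so any entries filled in by JavaScript simply would not appear:

# Static-fetch sketch for comparison (assumed selector; only server-side HTML is visible)
import requests
from bs4 import BeautifulSoup

html = requests.get("http://download.csdn.net/user/eastmount/uploads", timeout=10).text
soup = BeautifulSoup(html, "html.parser")
# Same div.content > h3 > a structure targeted by the XPath above;
# this may print nothing if the list is rendered by JavaScript after page load.
for a in soup.select("div.content h3 a"):
    print(a.get_text(strip=True), a.get("href"))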