今天用 selenium 和 casperjs 两种方式,对 https://class.coursera.org/nlp/lecture 网站上 ppt、pdf、srt、MP4 的下载地址进行数据抓取。1、python+selenium:#!/usr/bin/python  # -*- coding: utf-8 -*-  from selenium import webdriver  from bs4 import BeautifulSoup  import t…
今天用 selenium 和 casperjs 两种方式,对 https://class.coursera.org/nlp/lecture 网站上 ppt、pdf、srt、MP4 的下载地址进行数据抓取。
1、python+selenium
#!/usr/bin/python
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import sys
# Python 2 idiom: reload the sys module, then set the interpreter-wide default
# string encoding to UTF-8. Presumably this is here so the non-ASCII page text
# printed by catchDate does not raise encoding errors -- TODO confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
def catchDate(s):
    """Extract lecture data from the page: print it and collect unviewed items.

    For every ``<ul class="course-item-list-div-list">`` in the HTML this
    prints the section heading (the ``<h3>`` found in the list's previous
    sibling), then, for each ``<li class="unviewed">`` in the list, the text
    of its first link followed by the ``href`` of every link inside its
    ``<div class="course-lecture-item-resource">``.

    Args:
        s: HTML source of the lecture index page.

    Returns:
        A list with one entry per section that parsed without error and has
        at least one unviewed item; each entry is that section's list of
        ``<li class="unviewed">`` tags.
    """
    # NOTE(review): no parser is named, so bs4 chooses among the installed
    # ones; consider pinning one once the target environment is known.
    soup = BeautifulSoup(s)
    z = []
    for section in soup.find_all("ul", class_="course-item-list-div-list"):
        try:
            print(section.previous_sibling.find('h3').get_text())
            items = section.find_all('li', class_="unviewed")
            for item in items:
                print(' ' + item.find('a').get_text())
                resources = item.find('div', class_="course-lecture-item-resource")
                for link in resources.find_all('a'):
                    print(' ' + link['href'])
        except Exception as exc:
            # Best effort: a section with unexpected markup is skipped, but
            # the reason is reported instead of being silently discarded.
            sys.stderr.write("skipping section: %r\n" % (exc,))
            continue
        # The original compared the result list with "" (always unequal), so
        # empty sections were appended too; test for emptiness instead.
        if items:
            z.append(items)
    return z
# Time one complete run: start PhantomJS, load the lecture index, scrape it.
starttime = time.time()
# Raw string so the backslashes in the Windows path can never be read as
# escape sequences.
driver = webdriver.PhantomJS(executable_path=r'C:\phantomjs-1.9.7-windows\phantomjs.exe')
try:
    driver.get("https://class.coursera.org/nlp/lecture")
    html = driver.page_source
    content = catchDate(html)
    endtime = time.time()
    # Elapsed wall-clock time in seconds.
    print(endtime - starttime)
finally:
    # quit must be *called* (the original only referenced the method, so the
    # browser process was never shut down); doing it in `finally` also
    # releases the browser when loading or scraping fails.
    driver.quit()
2、casperjs
// Options for the CasperJS instance shared by every step in this script.
var casperOptions = {
    // Local jQuery copy; the page-side scraping code calls jQuery.
    clientScripts: ["jquery-1.7.js"],
    // 120 seconds, expressed in milliseconds.
    stepTimeout: 120 * 1000,
    pageSettings: {
        // Only text and links are read, so images are not loaded.
        loadImages: false
    },
    verbose: true,
    logLevel: "error"
};
var casper = require("casper").create(casperOptions);
// Shared state read and written by the steps below.
var fs = require('fs');
// Output file that receives the scraped text.
var filename = 'content.txt';
// Number of lecture sections counted on the page.
var numberOfLinks = 0;
// Text assembled by the page-side scraper.
var fullContent = "";
// Timestamps bracketing the run.
var startTime = new Date();
var endTime;
// First step: open the lecture index and report how many section lists it has.
casper.start("https://class.coursera.org/nlp/lecture", function() {
    // Runs inside the page; counts the section list elements.
    var countSections = function() {
        return __utils__.findAll('.course-item-list-div-list').length;
    };
    numberOfLinks = this.evaluate(countSections);
    this.echo(numberOfLinks + " items found");
});
// Step: echo the script's start timestamp, then queue the scraping step.
// Called with the Casper instance as `this`.
// Declared with `var` so it is no longer an implicit global (an undeclared
// assignment throws in strict-mode code).
var getStartTime = function() {
    this.echo(startTime);
    this.then(getcontent);
};
// Step: build the text report inside the page, store it in `fullContent`,
// then queue the file-writing step. Called with the Casper instance as `this`.
//
// For each `.course-item-list-div-list` element the report contains the <h3>
// text found in the element's previous sibling, then one entry per <li>:
// the text of its first link followed by the href of every `div a` link.
//
// Declared with `var` so it is no longer an implicit global.
var getcontent = function() {
    var scraped = this.evaluate(function() {
        // Page-side code: use `jQuery` throughout rather than mixing it with
        // `$`, which the page's own scripts may have rebound.
        var content = "";
        jQuery('.course-item-list-div-list').each(function() {
            var section = jQuery(this);
            var btitle = section.prev().find("h3").text();
            content += btitle + '\r\n';
            section.find("li").each(function() {
                var item = jQuery(this);
                var stitle = item.find("a").first().text();
                content += stitle + '\r';
                item.find("div a").each(function() {
                    content += jQuery(this).attr("href") + '\r';
                });
                content += '\r\n';
            });
            content += '\r\n\r\n';
        });
        return content;
    });
    // Fall back to an empty string so the write step never receives
    // null/undefined when the page-side function produced no result.
    fullContent = scraped || "";
    this.then(writefile);
};
// Step: announce the target file, write `fullContent` to `filename` in 'w'
// mode, then queue the end-time step. Called with the Casper instance as
// `this`.
// Declared with `var` so it is no longer an implicit global.
var writefile = function() {
    this.echo('writing to ' + filename);
    fs.write(filename, fullContent, 'w');
    this.then(getEndTime);
};
// Step: record the moment the scraping steps finished in `endTime`.
// Declared with `var` (no implicit global) and terminated with a semicolon
// instead of relying on automatic semicolon insertion.
var getEndTime = function() {
    endTime = new Date();
};
// Final step: echo the elapsed time in milliseconds and terminate CasperJS.
function exitSystem() {
    var elapsedMs = new Date() - startTime;
    this.echo(elapsedMs);
    casper.exit();
}

// Queue the scraping chain (getStartTime queues the later steps itself),
// then the exit step, and start running.
casper.then(getStartTime);
casper.then(exitSystem);
casper.run();
因为不熟练,感觉写得不太好,求大神对方法进行指导!
参考:
https://gist.github.com/imjared/5201405
http://casperjs.readthedocs.org/en/latest/modules/casper.html#evaluate
http://blog.csdn.net/u012577500/article/details/18185399
http://stackoverflow.com/questions/14894311/casperjs-windows-installation-how-is-it-done-the-correct-way-please
http://blog.csdn.net/sagomilk/article/details/20800543
本条技术文章来源于互联网,如果无意侵犯您的权益请点击此处反馈版权投诉
本文系统来源:php中文网