总结下最近完成的一个爬虫,具体就不说了,代码贴出来,需要的同学可以拿去玩玩。
Foursquare最大的问题是动态网页,就是说网址不变,但内容在变。这样的话,用Jsoup就无能为力了。
因此我使用了Selenium去解决动态网页的问题,代码调试的时候大家记得需要导入这个文件。
单个地点抓取类:
package Test1;
import java.net.UnknownHostException;
import com.mongodb.BasicDBObject;
import com.thoughtworks.selenium.*;
//This is the driver's import. You'll use this for instantiating a
//browser and making it do what you need.
import org.jsoup.Jsoup;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoException;
import com.mongodb.util.JSON;
import java.util.regex.Pattern;
@SuppressWarnings("deprecation")
public class GetELocation extends SeleneseTestCase {
/*get the information for every location and save it into mongodb*/
public String url;
public Document doc;
public DBCollection collection;
public BasicDBObject document = new BasicDBObject();
/**
 * Connects to the MongoDB server at localhost:27017 and stores a handle to the
 * "Foursquare" collection of the "FourS2" database in {@link #collection}.
 *
 * @throws Exception if creating the client or looking up the collection fails
 */
public void LinkMongodb() throws Exception {
    // NOTE(review): the client is never closed in this method; presumably it must
    // stay open for as long as the collection handle is used - confirm.
    final Mongo client = new Mongo("localhost", 27017);
    collection = client.getDB("FourS2").getCollection("Foursquare");
    System.out.println("Link Mongodb!");
}
/**
 * Delegates to the two-argument {@code setUp} overload (not visible in this
 * excerpt; presumably inherited from SeleneseTestCase), passing {@link #url}
 * and the {@code "*firefox"} browser string, then logs a progress message.
 *
 * @throws Exception propagated from the delegated {@code setUp} call
 */
public void setUp() throws Exception {
    final String browser = "*firefox";
    setUp(url, browser);
    System.out.println("Open firefox!");
}
/**
 * Opens {@link #url} through the Selenium session, pauses for a fixed 30 seconds,
 * then parses the browser's current HTML source with Jsoup and stores the result
 * in {@link #doc}.
 *
 * @throws Exception if the page cannot be opened or the pause is interrupted
 */
public void Openurl() throws Exception {
    final long renderWaitMillis = 30000;
    selenium.open(url);
    // Fixed pause before reading the source; presumably gives the dynamic
    // page content time to finish rendering.
    Thread.sleep(renderWaitMillis);
    // Hand the browser-rendered HTML to Jsoup so it can be queried later.
    doc = Jsoup.parse(selenium.getHtmlSource());
    System.out.println("Open the url and Save doc!");
}
/**
 * Stores the title of the parsed page in {@link #document} under the key "Title".
 *
 * @throws IllegalStateException if {@link #doc} has not been set yet
 */
public void Savetitle() throws Exception {
    if (doc == null) {
        // Fail with a clear message instead of an opaque NullPointerException.
        throw new IllegalStateException("doc is null; call Openurl() before Savetitle()");
    }
    // doc.title() is already a String, so wrapping it in new String(...) is redundant.
    document.put("Title", doc.title());
    System.out.println("Save Title of Page!");
}
public void SaveScore() throws Exception {
Elements Class = doc.select(".rating>span");
String score= new String(Class.text());
document.put("Score", score);