这是我很久以前用 Java 写的豆瓣电影数据爬取程序:使用 Apache HttpClient 发送网页请求,使用 jsoup 解析返回的页面。代码中的注释比较详细,所以下面直接给出代码:
(1)依赖的 jar 包:Apache HttpClient(及其依赖的 HttpCore)和 jsoup。
(2)爬取类
package crawl;
import org.apache.http.*;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
public class RetrivePage {
// Class-wide static state of the crawler.
// HTTP client built with HttpClients.createDefault(); downloadPage executes its requests through it.
private static CloseableHttpClient httpclient = HttpClients.createDefault(); // create a client
// NOTE(review): both paths below concatenate File.separator twice after "d:",
// so the resulting path contains a doubled separator — confirm this is intended.
private static String filename = "d:"+File.separator+File.separator+"liudehua.html"; // path of the output HTML file
private static String outfile = "d:"+File.separator+File.separator+"liudehua.csv"; // path of the output CSV file
private static boolean bfile = true; // boolean flag controlling file output
// NOTE(review): the original comment here was a verbatim copy of the one on bfile
// ("controls file output"); presumably this flag controls database output — confirm against its usage.
private static boolean bdb = true; // boolean flag (see review note above)
private static ArrayList<String> datalist = new ArrayList<String>(); // list used to store the information of each data record
private static String headtitle = "电影名称,上映时间,导演,演员,评价人数"; // header line to print: movie title, release date, director, cast, number of ratings
private static int countrs = 0; // counter variable
/**
* 下载页面
*/
public static String downloadPage(String url) throws Exception {
String htmlString = ""; //定义返回的String变量
HttpGet request = new HttpGet(url); //请求资源
CloseableHttpResponse response = httpclient.execute(request); //得到回应
try {
System.out.println(response.getStatusLine()); //打印状态码
HttpEntity entity = response.getEntity(); //获得Entity对象
htmlString = EntityUtils.t