package spider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

@SuppressWarnings("deprecation")public classZhihuSpider {/*** 模拟HTTP GET请求*/
public String doGet() throwsClientProtocolException, IOException {
String str= "";//创建HttpClient实例
HttpClient httpClient = newDefaultHttpClient();//创建Get方法实例
HttpUriRequest httpUriRequest = new HttpGet("http://www.zhihu.com");//添加必要的头信息
httpUriRequest
.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
httpUriRequest
.setHeader("Cookie","这里的Cookie拷贝复制登录后请求头里的Cookie值");
httpUriRequest.setHeader("DNT", "1");
httpUriRequest.setHeader("Connection", "keep-alive");
httpUriRequest.setHeader("Upgrade-Insecure-Requests", "1");
httpUriRequest.setHeader("Cache-Control", "max-age=0");
HttpResponse response=httpClient.execute(httpUriRequest);
HttpEntity entity=response.getEntity();if (entity != null) {
InputStream inputStream=entity.getContent();
str=convertStreamToString(inputStream);
}returnstr;
}public staticString convertStreamToString(InputStream is)throwsIOException {
InputStreamReader ir= new InputStreamReader(is, "UTF8");
BufferedReader reader= newBufferedReader(ir);
StringBuilder sb= newStringBuilder();
String line= null;try{while ((line = reader.readLine()) != null) {
sb.append(line+ "\n");
}
}catch(IOException e) {
e.printStackTrace();
}finally{try{
is.close();
}catch(IOException e) {
e.printStackTrace();
}
}returnsb.toString();
}//下载 URL 指向的网页
@SuppressWarnings("static-access")
@Testpublic void downloadFile() throwsException {//模拟HTTP GET请求
String responseBody =doGet();//解析数据
String writeStr =unparsedData(responseBody);//创建新文件
String path = "D:\\testFile\\zhihu.txt";
PrintWriter printWriter= null;
printWriter= new PrintWriter(new FileWriter(newFile(path)));//写内容
printWriter.write(writeStr);
printWriter.close();int offset = 10;int start = 9;for (int time = 0; time <= 100; time++) {//模拟POST请求
JSONObject jsonObject =JSONObject
.fromObject(doPost(offset, start));//解析数据(只拿JSON数据里的msg数组)
String addWriteStr = "";
JSONArray jsonArray= jsonObject.getJSONArray("msg");
Object[] arrays=jsonArray.toArray();for(Object array : arrays) {
addWriteStr+=unparsedData(array.toString());
}//追加文本
printWriter = new PrintWriter(new FileWriter(path, true));
printWriter.write(addWriteStr);
printWriter.close();//延时,调整参数
Thread.currentThread().sleep(1000);//毫秒
offset = offset + 10;
start= start + 10;
}
}/*** 根据HTML解析数据
*
*@paramhtml
* 源HTML
*@return解析后的数据*/
publicString unparsedData(String html) {
Document doc=Jsoup.parse(html);
Elements feeds= doc.getElementsByAttributeValue("class","feed-item-inner");
String writeStr= "";for(Element feed : feeds) {
Elements title= newElements();
Elements feedTitles= feed.getElementsByAttributeValue("class","feed-title");for(Element feedTitle : feedTitles) {
title= feedTitle.getElementsByTag("a");
}
Elements content= feed.getElementsByTag("textarea");
String titleHref= title.attr("href");
String titleText=title.text().trim();
String contentText=content.text().trim();//if(!titleText.contains("人民的名义")){//continue;//}
System.out.println("--------------------");
System.out.println("-----标题-----");
System.out.println("链接:" +titleHref);
System.out.println("内容:" +titleText);
System.out.println("-----内容-----");
System.out.println("内容:" +contentText);
System.out.println("--------------------");
writeStr+= "--------------------\n-----标题-----\n" +titleHref+ "\n" + titleText + "\n-----内容-----\n" +contentText+ "\n--------------------\n\n\n";
}returnwriteStr;
}/*** 模拟HTTP POST请求
*
*@paramoffset
* 参数offset
*@paramstart
* 参数start
*@return请求返回的JSON数据*/
public String doPost(int offset, int start) throwsException {
HttpClient httpClient= newDefaultHttpClient();
HttpUriRequest httpUriRequest=RequestBuilder
.post()
.setUri("https://www.zhihu.com/node/TopStory2FeedList")
.addParameter("params","{\"offset\":" + offset + ",\"start\":\"" +start+ "\"}").addParameter("method", "next").build();//添加必要的头信息
httpUriRequest
.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0");
httpUriRequest.setHeader("X-Xsrftoken","这里的X-Xsrftoken拷贝复制登录后请求头里的X-Xsrftoken值");
httpUriRequest.setHeader("X-Requested-With", "XMLHttpRequest");
httpUriRequest.setHeader("Referer", "https://www.zhihu.com/");
httpUriRequest
.setHeader("Cookie","这里的Cookie拷贝复制登录后请求头里的Cookie值");
httpUriRequest.setHeader("DNT", "1");
httpUriRequest.setHeader("Connection", "keep-alive");
httpUriRequest.setHeader("Cache-Control", "max-age=0");
HttpResponse response=httpClient.execute(httpUriRequest);
String str= "";
HttpEntity entity=response.getEntity();if (entity != null) {
InputStream instreams=entity.getContent();
str=convertStreamToString(instreams);
}returnstr;
}
}